Browse list in python and get data - python

I want to get the total number of views of a series of YouTube videos about a specific topic.
I have this code, but it doesn't work because I am not getting the correct data from the list "videos". How can I browse this list and get the total views of the complete list?
#!/usr/bin/python
import urllib2
from apiclient.discovery import build
from apiclient.errors import HttpError
from oauth2client.tools import argparser
# Set DEVELOPER_KEY to the API key value from the APIs & auth > Registered apps
# tab of
# https://cloud.google.com/console
# Please ensure that you have enabled the YouTube Data API for your project.
DEVELOPER_KEY = "AIzaSyB..........I"
YOUTUBE_API_SERVICE_NAME = "youtube"
YOUTUBE_API_VERSION = "v3"
def youtube_search(options):
youtube = build(YOUTUBE_API_SERVICE_NAME, YOUTUBE_API_VERSION,
developerKey=DEVELOPER_KEY)
# Call the search.list method to retrieve results matching the specified
# query term.
search_response = youtube.search().list(
q=options.q,
part="id,snippet",
maxResults=options.max_results
).execute()
videos = []
channels = []
playlists = []
# Add each result to the appropriate list, and then display the lists of
# matching videos, channels, and playlists.
for search_result in search_response.get("items", []):
if search_result["id"]["kind"] == "youtube#video":
videos.append("%s (%s)" % (search_result["snippet"]["title"],
search_result["id"]["videoId"]))
elif search_result["id"]["kind"] == "youtube#channel":
channels.append("%s (%s)" % (search_result["snippet"]["title"],
search_result["id"]["channelId"]))
elif search_result["id"]["kind"] == "youtube#playlist":
playlists.append("%s (%s)" % (search_result["snippet"]["title"],
search_result["id"]["playlistId"]))
print "Videos:\n", "\n".join(videos), "\n"
###################################
###################################
###HERE IS THE PROBLEM##########
###################################
for video in videos:
source = video
response = urllib2.urlopen(source)
html = response.read() #Done, you have the whole HTML file in a gigantic string
wordBreak = ['<','>']
html = list(html)
i = 0
while i < len(html):
if html[i] in wordBreak:
html[i] = ' '
i += 1
#The block above is just to make the html.split() easier.
html = ''.join(html)
html = html.split()
dataSwitch = False
numOfViews = ''
for element in html:
if element == '/div':
dataSwitch = False
if dataSwitch:
numOfViews += str(element)
if element == 'class="watch-view-count"':
dataSwitch = True
print (numOfViews)
########################################
########################################
#######################################
print "Channels:\n", "\n".join(channels), "\n"
print "Playlists:\n", "\n".join(playlists), "\n"
# Script entry point: register CLI flags on oauth2client's shared argparser,
# run the search, and report API errors without a traceback.
if __name__ == "__main__":
    argparser.add_argument("--q", help="Search term", default="la la land")
    argparser.add_argument("--max-results", help="Max results", default=25)
    args = argparser.parse_args()
    try:
        youtube_search(args)
    except HttpError, e:  # Python 2 except syntax; this file targets Python 2
        print "An HTTP error %d occurred:\n%s" % (e.resp.status, e.content)

Related

requests_html stop website from redirecting

I am trying to scrape the follow link https://9anime.to/watch/one-piece-dub.34r/r2wjlq using python/requests_html.
My problem is that it gets auto-redirected to the default server tab instead of the mp4upload tab; I am trying to find a fix for this but can't figure it out.
Below is the code
import re
import requests
import cloudscraper
from urllib import parse
from bs4 import BeautifulSoup
from requests_html import HTMLSession
base_url = 'https://9anime.to'
class nine_scraper:
    """Scraping helpers for 9anime.to.

    NOTE(review): the methods are declared without `self` or
    @staticmethod and are always invoked as nine_scraper.method(...);
    that works in Python 3 because functions looked up on the class are
    plain functions.
    """
    def get_ep_links(url):
        # Collect episode links for the mp4upload server (data-id '35').
        html = nine_scraper.get_html(url, True)
        servers = html.find('div', id='servers-container')
        if servers:
            results = []
            mp4upload_results = []
            mp4upload = servers.find('div', attrs={'data-id': '35'})
            mp4upload_eps = mp4upload.find_all('a', href=True)
            for ep in mp4upload_eps:
                x = (ep.get('href'), ep.text)
                mp4upload_results.append(x)
            for result in mp4upload_results:
                results.append(base_url + result[0])
            return results
        else:
            # Implicitly returns None when no server container is found.
            print('No servers found!!')
    def get_series_info(url):
        # Stub -- not implemented yet.
        return
    def get_servers(html):
        # Stub -- not implemented yet.
        return
    def find_download(url):
        # Renders the episode page; download extraction not implemented yet.
        html = nine_scraper.get_html(url, True)
    def search(query):
        """Search 9anime; returns (title, href) tuples, possibly followed by
        ('Previous page', url) / ('Next page', url) pagination entries."""
        # `query` may carry an embedded '&page=N' marker when paginating.
        if '&page=' in query:
            query = query.split('&page=')
            search_url = base_url + '/search?keyword=' + parse.quote(query[0]) + '&page=' + query[1]
        else:
            search_url = base_url + '/search?keyword=' + parse.quote(query)
        html = nine_scraper.get_html(search_url, False)
        film_list = html.find('div', class_='film-list')
        if film_list:
            results = []
            prev_page = html.find('a', class_='pull-left')
            next_page = html.find('a', class_='pull-right')
            films = film_list.find_all('div', class_='inner')
            for film in films:
                results.append((film.find('a', class_='name').text.strip(), film.find('a', class_='name').get('href').strip()))
            # Append pagination pseudo-results understood by results_menu().
            if prev_page.get('href'):
                param = parse.urlsplit(base_url + '/' + prev_page.get('href')).query
                url = parse.unquote_plus(param.replace('keyword=', ''), encoding='utf-8')
                results.append(('Previous page', url))
            if next_page.get('href'):
                param = parse.urlsplit(base_url + '/' + next_page.get('href')).query
                url = parse.unquote_plus(param.replace('keyword=', ''), encoding='utf-8')
                results.append(('Next page', url))
            return results
        else:
            print('No results found!')
    def get_html(url, render_js=False):  # Load webpage and return its html
        try:
            if render_js:  # Check if page needs to render javascript, if so use 'requests_html'
                session = HTMLSession()  # Make a GET request to your webpage, using 'Requests'
                resp = session.get(url, timeout=10)
                resp.raise_for_status()  # Raise an exception if respones doesnt come back 200-400
                resp.html.render(timeout=10)  # Render the javascript
                html = BeautifulSoup(resp.html.html, 'html.parser')  # Parse the html data we just got with 'BeautifulSoup4'
                return html  # Return the parsed html
            else:  # Use 'cloudscraper' since we dont need to load any javascript
                c_scraper = cloudscraper.create_scraper()  # Make a GET request to your webpage, using 'Requests'
                resp = c_scraper.get(url)
                resp.raise_for_status()  # Raise an exception if respones doesnt come back 200-400
                html = BeautifulSoup(resp.content, 'html.parser')  # Parse the html data we just got with 'BeautifulSoup4'
                return html  # Return the parsed html
        except requests.HTTPError as e:
            print(f'HTTP error occurred: {e}')
        except requests.ConnectionError as e:
            print(f'Connection Error occurred: {e}')
        except requests.Timeout as e:
            print(f'Timeout Error occurred: {e}')
        except requests.RequestException as e:
            print(f'General Error occurred: {e}')
        except Exception as e:
            print(f'Other error occurred: {e}')
        except KeyboardInterrupt:
            # Reachable: KeyboardInterrupt is not a subclass of Exception.
            print("Someone closed the program")
import sys
from os import system, name
from scrapers import nine_scraper
def screen_clear():
    """Clear the terminal: 'cls' on Windows (os.name == 'nt'), 'clear' elsewhere."""
    command = 'cls' if name == 'nt' else 'clear'
    _ = system(command)
def main_menu():
    """Top-level interactive menu; loops until a valid choice is entered."""
    while True:
        screen_clear()
        print('------9anime downloader------\n[1] Search \n[2] Download \n[3] Exit\n-----------------------------\n')
        main_choice = input('Enter your choice [1-3] >')
        if main_choice == '1':
            search_menu()
            break
        elif main_choice == '2':
            # Download flow not implemented yet; redisplay the menu.
            continue
        elif main_choice == '3':
            screen_clear()
            sys.exit()
        else:
            continue
def search_menu(query=False):
    """Run a search and show its results.

    query: a ready-made query string when paginating; False to prompt
    the user interactively.
    """
    screen_clear()
    print('--------------9anime downloader/search--------------\n')
    if query:
        search_results = nine_scraper.search(query)
        results_menu(search_results)
    else:
        query = input('Please enter the name of the anime >')
        if query:
            search_results = nine_scraper.search(query)
            results_menu(search_results)
def results_menu(results):
    """Print numbered search results and dispatch on the user's choice.

    results: list of (title, url) tuples from nine_scraper.search(); may
    end with ('Previous page', url) / ('Next page', url) pagination entries.
    """
    # BUG FIX: `p` and `n` were assigned only when the corresponding
    # pagination entry existed, so typing 'p'/'n' without one raised
    # UnboundLocalError.  Initialise both flags up front.
    has_prev = False
    has_next = False
    for num, result in enumerate(results, 1):
        title = result[0]
        link = result[1]
        if 'Previous page' not in title:
            if 'Next page' in title:
                has_next = True
                print('[N] ' + title)
            else:
                print(f'[{num}] {title}')
        else:
            has_prev = True
            print('[P] ' + title)
    print('[M] Main menu')
    titles, links = map(list, zip(*results))
    while True:
        search_choice = input('Enter choice >')
        try:
            search_choice = int(search_choice)
            # BUG FIX: the upper bound was `len(results) + 1`, which let a
            # choice one past the end raise IndexError on links[...].
            if 1 <= search_choice <= len(results):
                print(links[search_choice - 1])
                print(titles[search_choice - 1])
                ep_links = nine_scraper.get_ep_links(links[search_choice - 1])
                for link in ep_links:
                    print(link)
                    nine_scraper.find_download(link)
                # series_menu(links[search_choice - 1])
                break
        except ValueError:
            if search_choice.lower() == 'm':
                main_menu()
                break
            elif search_choice.lower() == 'p':
                if has_prev:
                    url = links[-2]
                    search_menu(url)
                    break
                continue
            elif search_choice.lower() == 'n':
                if has_next:
                    url = links.pop()
                    search_menu(url)
                    break
                continue
def series_menu(url):
    """Show info for a single series (info retrieval is still a stub)."""
    # BUG FIX: get_series_info() was called with no argument even though
    # it takes a url parameter, which raises TypeError.
    info = nine_scraper.get_series_info(url)
main_menu()
I know it must be some JavaScript that is redirecting the page, but I can't figure out what I need to do in order to stop that; any help would be much appreciated!
Using requests_html you can set allow_redirects=False like this:
r = session.get(url,allow_redirects=False)
Now your request should go only to the requested URL.

is there anyone who knows how to fix this error

I'm trying to run this code for my project, but it gives an error at the try statement, i.e. "unindent does not match any outer indentation level".
from apiclient.discovery import build
from apiclient.errors import HttpError
from oauth2client.tools import argparser
DEVELOPER_KEY = "Replaced_my_API_key"
YOUTUBE_API_SERVICE_NAME = "youtube"
YOUTUBE_API_VERSION = "v3"
def youtube_search(options):
    """Search YouTube for options.q and print the matching videos,
    channels and playlists.

    options: argparse Namespace with `q` and `max_results` attributes.
    Raises HttpError on API failures (handled by the caller).
    """
    youtube = build(YOUTUBE_API_SERVICE_NAME, YOUTUBE_API_VERSION,
                    developerKey=DEVELOPER_KEY)
    search_response = youtube.search().list(
        q=options.q,
        part="id,snippet",
        maxResults=options.max_results
    ).execute()
    videos = []
    channels = []
    playlists = []
    # Route each result to the matching bucket, keyed on the resource kind.
    for search_result in search_response.get("items", []):
        if search_result["id"]["kind"] == "youtube#video":
            videos.append("%s (%s)" % (search_result["snippet"]["title"],
                                       search_result["id"]["videoId"]))
        elif search_result["id"]["kind"] == "youtube#channel":
            channels.append("%s (%s)" % (search_result["snippet"]["title"],
                                         search_result["id"]["channelId"]))
        elif search_result["id"]["kind"] == "youtube#playlist":
            playlists.append("%s (%s)" % (search_result["snippet"]["title"],
                                          search_result["id"]["playlistId"]))
    print ("Videos:\n", "\n".join(videos), "\n")
    print ("Channels:\n", "\n".join(channels), "\n")
    print ("Playlists:\n", "\n".join(playlists), "\n")
# Script entry point: register CLI flags on oauth2client's shared argparser.
if __name__ == "__main__":
    to_search = "Witcher 3"
    argparser.add_argument("--q", help="Search term", default=to_search)
    argparser.add_argument("--max-results", help="Max results",
                           default=25)
    args = argparser.parse_args()
**The code raises the error (the "unindent" error) starting from the following statement:**
    # Continuation of the __main__ block: run the search and report API
    # errors without a traceback.
    try:
        youtube_search(args)
    except HttpError, e:  # Python 2 except syntax
        print "An HTTP error %d occurred:\n%s" % (e.resp.status, e.content)
This error is common if you are mixing indentation with spaces and tabs. Choose one and stick to it. You can try two things:
Colorize the whitespace in your editor. In vim you can do this with :set list. Find the offending line and correct it.
Give Python the option -tt in the shebang: #!/usr/bin/python -tt. This gives you extra warnings when you are mixing indentation styles in the same file.

function1 from other file fail when that function1 is calling another function2 inside function1

The code in FileB.py works fine, but it fails at one point when I call it from the other file. I found that it stops working at the "search_response" call in the code below.
FileA.py
from FileB import *
search = "stackoverflow"
searchF(search)
FileB.py
from apiclient.discovery import build
from apiclient.errors import HttpError
from oauth2client.tools import argparser
search = "Google"
def searchF(search):
DEVELOPER_KEY = "REPLACE_ME"
YOUTUBE_API_SERVICE_NAME = "youtube"
YOUTUBE_API_VERSION = "v3"
print "searchF started" - works
def youtube_search(options):
youtube = build(YOUTUBE_API_SERVICE_NAME, YOUTUBE_API_VERSION,
developerKey=DEVELOPER_KEY)
search_response = youtube.search().list(
q=options.q,
type="video",
part="id,snippet",
maxResults=options.max_results
).execute()
print "search_response executed" doesn't work
search_videos = []
for search_result in search_response.get("items", []):
search_videos.append(search_result["id"]["videoId"])
video_ids = ",".join(search_videos)
video_response = youtube.videos().list(
id=video_ids,
part='snippet, contentDetails'
).execute()
videos = []
for video_result in video_response.get("items", []):
videos.append("%s, (%s,%s)" % (video_result["snippet"]["title"],
video_result["contentDetails"],
video_result["contentDetails"]))
find = "licensedContent': True"
result = ', '.join(videos)
print find in result
if __name__ == "__main__":
print "__main__"
argparser.add_argument("--q", help="Search term", default=search)
argparser.add_argument("--max-results", help="Max results", default=25)
args = argparser.parse_args()
try:
youtube_search(args)
except HttpError, e:
print "An HTTP error %d occurred:\n%s" % (e.resp.status, e.content)
I changed if __name__ == "__main__": to if 1: and it kind of works. But I assume that's a horrible solution.

NameError: name 'result' is not defined. Can't figure out how to get the results to the next line

I'm a complete beginner in Python, trying to get a script to work, but I'm a little at a loss as to where it goes wrong. From reading other posts, it seems `result` has not been defined before it is used, so Python doesn't know how to deal with it.
I'm running Python 2.7.11 on EL Capitan
python ytc.py -v YQHsXMglC9A [*] Retrieving video ID: YQHsXMglC9A [*]
Thumbnails retrieved. Now submitting to TinEye. [*] Searching TinEye
for: https://i.ytimg.com/vi/YQHsXMglC9A/default.jpg Traceback (most
recent call last): File "ytc.py", line 72, in <module>
if result.total_results: NameError: name 'result' is not defined
This is the script:
import argparse
import requests
import json
from pytineye import TinEyeAPIRequest
tineye = TinEyeAPIRequest('http://api.tineye.com/rest/','PUBLICKEY','PRIVATEKEY')
youtube_key = "MY-API"
ap = argparse.ArgumentParser()
ap.add_argument("-v","--videoID", required=True,help="The videoID of the YouTube video. For example: https://www.youtube.com/watch?v=VIDEOID")
args = vars(ap.parse_args())
video_id = args['videoID']
#
# Retrieve the video details based on videoID
#
def youtube_video_details(video_id):
    """Fetch snippet + recordingDetails for one video from the v3 API.

    Returns the decoded JSON dict, or None on a non-200 response.
    """
    api_url = "https://www.googleapis.com/youtube/v3/videos?part=snippet%2CrecordingDetails&"
    api_url += "id=%s&" % video_id
    api_url += "key=%s" % youtube_key
    response = requests.get(api_url)
    if response.status_code != 200:
        return None
    return json.loads(response.content)
print "[*] Retrieving video ID: %s" % video_id
video_data = youtube_video_details(video_id)
thumbnails = video_data['items'][0]['snippet']['thumbnails']
print "[*] Thumbnails retrieved. Now submitting to TinEye."
url_list = []
# add the thumbnails from the API to the list
for thumbnail in thumbnails:
url_list.append(thumbnails[thumbnail]['url'])
# build the manual URLS
for count in range(4):
url = "http://img.youtube.com/vi/%s/%d.jpg" % (video_id,count)
url_list.append(url)
results = []
# now walk over the list of URLs and search TinEye
for url in url_list:
print "[*] Searching TinEye for: %s" % url
try:
result = tineye.search_url(url)
except:
pass
if result.total_results:
results.extend(result.matches)
result_urls = []
dates = {}
for match in results:
for link in match.backlinks:
if link.url not in result_urls:
result_urls.append(link.url)
dates[link.crawl_date] = link.url
print
print "[*] Discovered %d unique URLs with image matches." % len(result_urls)
for url in result_urls:
print url
oldest_date = sorted(dates.keys())
print
print "[*] Oldest match was crawled on %s at %s" % (str(oldest_date[0]),dates[oldest_date[0]])
If the call in the try block fails, the except block runs; it contains only pass and never assigns the variable result. In that case, the line if result.total_results references a name that does not exist.
This should be a quick fix
try:
result = tineye.search_url(url)
except NameError:
print 'Nothing Found !'
break
if result.total_results:
results.extend(result.matches)
The error is clear: the result variable is used while it is not defined. This happens whenever the try/except instruction fails.
Fix it by moving the instruction into the try block :
for url in url_list:
print "[*] Searching TinEye for: %s" % url
try:
result = tineye.search_url(url)
if result.total_results:
results.extend(result.matches)
except:
pass

Python KeyError exception / exiting code without error

I have a rather weird problem. The following code in scrape1 sometimes works as it should, but most of the time it just stops at line 24, where requests.get is being used. I do, however, consistently get this KeyError exception:
Exception KeyError: KeyError(140186412830800,) in module <'threading' from '/usr/lib/python2.7/threading.pyc'> ignored
The exception is only thrown when I import the module proxyfetch.py, but as long as I don't actually execute the code in proxyfetch.py, scrape1.py doesn't break (the exception is thrown after nominal execution). Proxyfetch is based on DanMcInerney's elite-proxy-finder on GitHub. I just edited it so I could use it as a module that returns a list of proxies instead of printing them.
So here are the 2 scripts:
scrape1.py:
#scrape1.py
from bs4 import BeautifulSoup
from proxyfetch import getprox
import requests
proxcount=3
listz = getprox(proxcount)
proxfile = open("proxysave.txt", "w")
base_url = "http://google.com"
def pagefetch(url):
    """GET `url` through the first proxy returned by getprox().

    Returns the `requests` Response object.  The "Test" prints are
    leftover debug tracing from the original author.
    """
    print "Test"
    http_proxy = "http://"+listz[0]
    #http_proxy = "http://103.25.203.227:3127"
    print "Test2"
    proxydict = {
        "http" : http_proxy
        #"https_proxy" : https_proxy
    }
    print "Test3"
    page = requests.get(url, proxies=proxydict) #with proxy
    #page = requests.get(url) #without proxy
    print "Test4"
    return page
# Fetch google.com through the proxy and dump every anchor tag found.
page = pagefetch(base_url)
soup = BeautifulSoup(page.text)  # NOTE(review): no explicit parser given; bs4 picks one
links = soup.find_all("a")
if links:
    for n in links:
        print n
else:
    print "I got nuthin."
And proxyfetch.py
#!/usr/bin/env python2
#proxyfetch.py
'''Finds hundreds of elite anonymity (L1) HTTP proxies then tests them all in parallel printing the fastest ones first.
Checks headers to confirm eliteness, checks if compatible with opening HTTPS sites, and confirms the proxy is working
through multiple IP checking sites'''
# TO DO:
# -Add http://free-proxy-list.net/
# -Add hidemyass
#from IPython import embed
__author__ = 'Dan McInerney'
__contact__ = 'danhmcinerney gmail'
from gevent import monkey
monkey.patch_all()
import requests
import ast
import gevent
import sys, re, time, os, argparse
import socket
from bs4 import BeautifulSoup
listz =[]
def getprox(amount):
    """Return up to `amount` working elite proxies as 'ip:port' strings.

    find_http_proxy appends passing proxies to the module-global `listz`
    and terminates via sys.exit() (a SystemExit) once the limit is hit,
    which is why BaseException -- not Exception -- is caught here.
    """
    argz = [amount, False, True]
    try:
        P = find_http_proxy(argz)
        P.run()
    except BaseException,Err:
        # SystemExit from limiter() (or any failure): return what we have.
        return listz
    return listz
def parse_args():
    """Build and parse the CLI options: -s/--show, -a/--all, -q/--quiet."""
    p = argparse.ArgumentParser()
    p.add_argument('-s', '--show', help='Show this number of results. Example: "-s 5" will show the 5 fastest proxies then stop')
    p.add_argument('-a', '--all', help='Show all proxy results including the ones that failed 1 of the 3 tests', action='store_true')
    p.add_argument('-q', '--quiet', help='Only print the IP:port of the fastest proxies that pass all the tests', action='store_true')
    return p.parse_args()
class find_http_proxy():
''' Will only gather L1 (elite anonymity) proxies
which should not give out your IP or advertise
that you are using a proxy at all '''
#argz = [arg1, False, True]
def __init__(self, argz):
    # argz = [show_num, show_all, quiet]; only argz[0] is honoured --
    # show_all/quiet are hard-coded below because getprox() drives this
    # class as a quiet library rather than a CLI tool.
    self.proxy_list = []
    self.headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.154 Safari/537.36'}
    self.show_num = argz[0]
    self.show_all = False
    self.quiet = True
    self.errors = []
    self.print_counter = 0
    # NOTE: performs a network request at construction time.
    self.externalip = self.external_ip()
def external_ip(self):
    """Return this machine's public IP as reported by myip.dnsdynamic.org."""
    req = requests.get('http://myip.dnsdynamic.org/', headers=self.headers)
    ip = req.text
    return ip
def run(self):
''' Gets raw high anonymity (L1) proxy data then calls make_proxy_list()
Currently parses data from gatherproxy.com and letushide.com '''
if not self.quiet:
print '[*] Your accurate external IP: %s' % self.externalip
letushide_list = self.letushide_req()
if not self.quiet:
print '[*] letushide.com: %s proxies' % str(len(letushide_list))
# Has a login now :(
gatherproxy_list = self.gatherproxy_req()
if not self.quiet:
print '[*] gatherproxy.com: %s proxies' % str(len(gatherproxy_list))
checkerproxy_list = self.checkerproxy_req()
if not self.quiet:
print '[*] checkerproxy.net: %s proxies' % str(len(checkerproxy_list))
self.proxy_list.append(letushide_list)
self.proxy_list.append(gatherproxy_list)
self.proxy_list.append(checkerproxy_list)
# Flatten list of lists (1 master list containing 1 list of ips per proxy website)
self.proxy_list = [ips for proxy_site in self.proxy_list for ips in proxy_site]
self.proxy_list = list(set(self.proxy_list)) # Remove duplicates
if not self.quiet:
print '[*] %d unique high anonymity proxies found' % len(self.proxy_list)
print '[*] Testing proxy speeds ...'
print ''
print ' Proxy | CC | Domain | Time/Errors'
self.proxy_checker()
return list_
def checkerproxy_req(self):
    ''' Make the request to checkerproxy and create a master list from that site '''
    # NOTE(review): cp_ips is never used.
    cp_ips = []
    try:
        url = 'http://checkerproxy.net/all_proxy'
        r = requests.get(url, headers=self.headers)
        html = r.text
    except Exception:
        # Network failure: report it and fall back to an empty list.
        print '[!] Failed to get reply from %s' % url
        checkerproxy_list = []
        return checkerproxy_list
    checkerproxy_list = self.parse_checkerproxy(html)
    return checkerproxy_list
def parse_checkerproxy(self, html):
    ''' Only get elite proxies from checkerproxy '''
    ips = []
    soup = BeautifulSoup(html)
    for tr in soup.findAll('tr'):
        # Rows with exactly 19 children are the data rows of this
        # site's table -- presumably; TODO confirm against live markup.
        if len(tr) == 19:
            ip_found = False
            elite = False
            ip_port = None
            tds = tr.findAll('td')
            for td in tds:
                if ':' in td.text:
                    ip_found = True
                    ip_port_re = re.match('(\d{1,3}\.){3}\d{1,3}:\d{1,5}', td.text)
                    if ip_port_re:
                        ip_port = ip_port_re.group()
                    if not ip_port:
                        ip_found = False
                if 'Elite' in td.text:
                    elite = True
                # Keep the row only once both an ip:port and the 'Elite'
                # anonymity label have been seen in its cells.
                if ip_found == True and elite == True:
                    ips.append(str(ip_port))
                    break
    return ips
def letushide_req(self):
    ''' Make the request to the proxy site and create a master list from that site '''
    letushide_ips = []
    for i in xrange(1,20): # can search maximum of 20 pages
        try:
            url = 'http://letushide.com/filter/http,hap,all/%s/list_of_free_HTTP_High_Anonymity_proxy_servers' % str(i)
            r = requests.get(url, headers=self.headers)
            html = r.text
            ips = self.parse_letushide(html)
            # Check html for a link to the next page
            if '/filter/http,hap,all/%s/list_of_free_HTTP_High_Anonymity_proxy_servers' % str(i+1) in html:
                pass
            else:
                # Last page: record its ips and stop paginating.
                letushide_ips.append(ips)
                break
            letushide_ips.append(ips)
        except:
            print '[!] Failed get reply from %s' % url
            break
    # Flatten list of lists (1 list containing 1 list of ips for each page)
    letushide_list = [item for sublist in letushide_ips for item in sublist]
    return letushide_list
def parse_letushide(self, html):
    ''' Parse out list of IP:port strings from the html '''
    # The site renders each row as:  IP</a></td><td>PORT<...
    # \d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3} - matches IP addresses
    # </a></td><td> - is in between the IP and the port
    # .*?< - non-greedy match up to the next '<' (the end of the port cell)
    raw_ips = re.findall('\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}</a></td><td>.*?<', html)
    return [chunk.replace('</a></td><td>', ':').strip('<') for chunk in raw_ips]
def gatherproxy_req(self):
    """Fetch gatherproxy.com's elite list; returns [] on any failure."""
    url = 'http://gatherproxy.com/proxylist/anonymity/?t=Elite'
    try:
        r = requests.get(url, headers = self.headers)
        lines = r.text.splitlines()
    except:
        print '[!] Failed get reply from %s' % url
        gatherproxy_list = []
        return gatherproxy_list
    gatherproxy_list = self.parse_gp(lines)
    return gatherproxy_list
def parse_gp(self, lines):
    ''' Parse the raw scraped data '''
    # Each proxy record is embedded in inline JS as:
    #   gp.insertPrx({"PROXY_IP": ..., "PROXY_PORT": ..., ...});
    gatherproxy_list = []
    for line in lines:
        if 'proxy_ip' not in line.lower():
            continue
        cleaned = line.replace('gp.insertPrx(', '')
        cleaned = cleaned.replace(');', '')
        cleaned = cleaned.replace('null', 'None')
        cleaned = cleaned.strip()
        record = ast.literal_eval(cleaned)
        gatherproxy_list.append('%s:%s' % (record["PROXY_IP"], record["PROXY_PORT"]))
        #ctry = record["PROXY_COUNTRY"]
    return gatherproxy_list
def proxy_checker(self):
    ''' Concurrency stuff here '''
    # One gevent greenlet per candidate proxy; each runs the full
    # four-URL test battery in proxy_checker_req().
    jobs = [gevent.spawn(self.proxy_checker_req, proxy) for proxy in self.proxy_list]
    try:
        gevent.joinall(jobs)
    except KeyboardInterrupt:
        sys.exit('[-] Ctrl-C caught, exiting')
def proxy_checker_req(self, proxy):
    ''' See how long each proxy takes to open each URL '''
    proxyip = str(proxy.split(':', 1)[0])
    # A lot of proxy checker sites give a different final octet for some reason
    #proxy_split = proxyip.split('.')
    #first_3_octets = '.'.join(proxy_split[:3])+'.'
    results = []
    # Two plain ip-echo pages, one HTTPS page, one header-echo page.
    urls = ['http://danmcinerney.org/ip.php', 'http://myip.dnsdynamic.org', 'https://www.astrill.com/what-is-my-ip-address.php', 'http://danmcinerney.org/headers.php']
    for url in urls:
        try:
            check = requests.get(url,
                                 headers = self.headers,
                                 proxies = {'http':'http://'+proxy,
                                            'https':'http://'+proxy},
                                 timeout = 15)
            time_or_error = str(check.elapsed)
            html = check.text
            # Page loaded: validate its content (may downgrade to an error).
            time_or_error = self.html_handler(time_or_error, html, url)
            url = self.url_shortener(url)
            results.append((time_or_error, proxy, url))
        except Exception as e:
            # Network errors become short 'Err: ...' tags via error_handler().
            time_or_error = self.error_handler(str(e))
            url = self.url_shortener(url)
            results.append((time_or_error, proxy, url))
    self.print_handler(results, proxyip)
def html_handler(self, time_or_error, html, url):
    ''' Check the html for errors and if none are found return time to load page '''
    # NOTE(review): in the ip-echo and astrill branches below, BOTH arms of
    # the inner if/else (and the no-match case) assign the same
    # 'Err: Page loaded; proxy failed' message, so those two checks can
    # never pass as written -- this looks like a mangled edit of the
    # upstream elite-proxy-finder; confirm against the original source.
    html_lines = html.splitlines()
    leng = len(html_lines)
    ipre = '(?:[0-9]{1,3}\.){3}[0-9]{1,3}'
    # Both of these urls just return the ip and nothing else
    if url in ['http://danmcinerney.org/ip.php', 'http://myip.dnsdynamic.org']:
        if leng == 1: # Should return 1 line of html
            match = re.match(ipre, html)
            if match:
                if self.externalip in html:
                    time_or_error = 'Err: Page loaded; proxy failed'
                else:
                    time_or_error = 'Err: Page loaded; proxy failed'
            else:
                time_or_error = 'Err: Page loaded; proxy failed'
        return time_or_error
    # This is the SSL page
    if 'astrill' in url:
        soup = BeautifulSoup(html)
        ip = soup.find("td", { "colspan": 2 }).text # the ip is the only on with colspan = 2
        match = re.match(ipre, ip)
        if match:
            if self.externalip in ip:
                time_or_error = 'Err: Page loaded; proxy failed'
            else:
                time_or_error = 'Err: Page loaded; proxy failed'
        return time_or_error
    if '/headers' in url:
        # check for proxy headers
        proxy_headers = ['via: ', 'forwarded: ', 'x-forwarded-for', 'client-ip']
        if leng > 15: # 15 is arbitrary, I just don't think you'll ever see more than 15 headers
            time_or_error = 'Err: headers not returned'
            return time_or_error
        for l in html_lines:
            for h in proxy_headers:
                if h in l.lower():
                    time_or_error = 'Err: Proxy headers found'
                    return time_or_error
        time_or_error = 'Passed: elite proxy'
        return time_or_error
def print_handler(self, results, proxyip):
    """Report this proxy's four test results.

    In -a mode everything is shown; otherwise only proxies that passed
    all tests are reported (printed or, in quiet mode, collected).
    """
    if self.show_all:
        country_code = self.get_country_code(proxyip)
        self.printer(results, country_code)
        self.print_counter += 1
    else:
        passed_all = self.passed_all_tests(results)
        if passed_all:
            country_code = self.get_country_code(proxyip)
            self.printer(results, country_code)
            self.print_counter += 1
    # Stop the whole run once the -s limit is reached.
    if self.show_num:
        self.limiter()
def printer(self, results, country_code):
    ''' Creates the output '''
    counter = 0
    if not self.quiet:
        print '--------------------------------------------------------------------'
    for r in results:
        counter += 1
        time_or_error = r[0]
        proxy = r[1]
        url = r[2]
        if self.quiet:
            if counter % 4 == 0: #################### THIS results is a list of 4 tuples each, so proxies will repeat 4 times
                #print proxy
                # Library mode: collect the proxy into the module-global
                # list consumed by getprox() instead of printing it.
                global listz
                listz.append(proxy)
        else:
            # Only print the proxy once, on the second print job
            if counter == 1:
                print '%s | %s | %s | %s' % (proxy.ljust(21), country_code.ljust(3), url.ljust(21), time_or_error)
            else:
                print '%s | %s | %s | %s' % (' '.ljust(21), ' ', url.ljust(21), time_or_error)
def get_country_code(self, proxyip):
    ''' Get the 3 letter country code of the proxy using geoiptool.com
    Would use the geoip library, but it requires a local DB and what
    is the point of that hassle other than marginal speed improvement '''
    cc_line_found = False
    cc = 'N/A'
    try:
        r = requests.get('http://www.geoiptool.com/en/?IP=%s' % proxyip, headers=self.headers)
        html = r.text
        html_lines = html.splitlines()
        for l in html_lines:
            # The code itself sits in parentheses on the line AFTER the
            # 'country code:' label, hence the one-line-delay flag.
            if cc_line_found == True:
                cc = l.split('(', 1)[1].split(')', 1)[0]
                break
            if 'country code:' in l.lower():
                cc_line_found = True
    except:
        # Best-effort lookup; fall back to 'N/A' on any failure.
        pass
    return cc
def error_handler(self, e):
    """Map a raw exception message to a short, human-readable error tag."""
    lowered = e.lower()
    if 'Cannot connect' in e:
        return 'Err: Cannot connect to proxy'
    if 'timed out' in lowered:
        return 'Err: Timed out'
    if 'retries exceeded' in e:
        return 'Err: Max retries exceeded'
    if 'Connection reset by peer' in e:
        return 'Err: Connection reset by peer'
    if 'readline() takes exactly 1 argument (2 given)' in e:
        return 'Err: SSL error'
    return 'Err: ' + e
def url_shortener(self, url):
    """Abbreviate a test URL for the fixed-width results table."""
    shortcuts = (
        ('ip.php', 'danmcinerney.org'),
        ('headers.php', 'Header check'),
        ('dnsdynamic', 'dnsdynamic.org'),
        ('astrill', 'https://astrill.com'),
    )
    for marker, short_name in shortcuts:
        if marker in url:
            return short_name
    return url
def passed_all_tests(self, results):
    """Return True iff no result in `results` carries an 'Err:' tag.

    Each result is a (time_or_error, proxy, url) tuple from
    proxy_checker_req().
    """
    # BUG FIX: the original failure path mutated a module-level global
    # (`global testx; testx = 50`) that nothing in this module ever
    # reads (limiter() assigns its own local `testx`); the dead side
    # effect has been removed.
    for time_or_error, _proxy, _url in results:
        if 'Err:' in time_or_error:
            return False
    return True
def limiter(self):
    ''' Kill the script if user supplied limit of successful proxy attempts (-s argument) is reached '''
    # BUG FIX: removed the dead local `testx = 0` that preceded -- and
    # therefore demoted -- the docstring; it was never read.
    if self.print_counter >= int(self.show_num):
        sys.exit()

Categories