So I've found this code on GitHub for gathering IPs from https://free-proxy-list.net/ and rotating them. But I get an error message when I try to run it.
I am using ChromeDriver 2.41 because I was first getting a different error regarding the Socks integer. Using ChromeDriver 2.41 has solved that, but I still can't get past this 'pxy' reference.
ALSO, pycharm is alerting me that 'pd' has a redeclared definition without usage. I'd really appreciate some help with the 'pxy' and 'pd' errors!
This is the code :
from selenium import webdriver
from selenium.webdriver.chrome.options import DesiredCapabilities
from selenium.webdriver.common.proxy import Proxy, ProxyType
import time
# Shared Chrome configuration: quiet logging plus headless mode, so the
# scraping browser runs without opening a visible window.
co = webdriver.ChromeOptions()
co.add_argument("log-level=3")  # suppress INFO/WARNING console noise
co.add_argument("--headless")
def get_proxies(co=co):
    """Scrape https://free-proxy-list.net/ and return a list of "ip:port"
    strings for proxies whose last table column ("Https") is "yes".

    co -- ChromeOptions used to launch the scraping browser.
    """
    driver = webdriver.Chrome(chrome_options=co)
    try:
        driver.get("https://free-proxy-list.net/")
        PROXIES = []
        # Each row's text looks like "IP PORT CODE COUNTRY ... HTTPS", so
        # result[0]/result[1] are address/port and result[-1] is the flag.
        for p in driver.find_elements_by_css_selector("tr[role='row']"):
            result = p.text.split(" ")
            if result[-1] == "yes":  # keep only HTTPS-capable proxies
                PROXIES.append(result[0] + ":" + result[1])
        return PROXIES
    finally:
        # quit() (not close()) ends the ChromeDriver session even when the
        # scrape raises, so headless Chrome processes do not accumulate.
        driver.quit()
ALL_PROXIES = get_proxies()
def proxy_driver(PROXIES, co=co):
    """Return a Chrome driver configured to tunnel through a proxy.

    PROXIES -- list of "ip:port" strings; the last entry is used.
    co      -- ChromeOptions shared with the rest of the script.

    Fixes the reported UnboundLocalError: the original assigned ``pxy``
    only when PROXIES was non-empty, so the empty-list branch refilled
    the pool and then used an unbound name.
    """
    prox = Proxy()
    if not PROXIES:
        print("--- Proxies used up (%s)" % len(PROXIES))
        PROXIES = get_proxies()
    # Taken from the (possibly refreshed) pool -- bound on every path.
    pxy = PROXIES[-1]
    prox.proxy_type = ProxyType.MANUAL
    prox.http_proxy = pxy
    prox.socks_proxy = pxy
    prox.ssl_proxy = pxy
    capabilities = webdriver.DesiredCapabilities.CHROME
    prox.add_to_capabilities(capabilities)
    driver = webdriver.Chrome(chrome_options=co, desired_capabilities=capabilities)
    return driver
# --- YOU ONLY NEED TO CARE FROM THIS LINE ---
# creating new driver to use proxy
pd = proxy_driver(ALL_PROXIES)
# code must be in a while loop with a try to keep trying with different proxies
running = True
while running:
    try:
        mycodehere()
        # if statement to terminate loop if code working properly
        something()
    except Exception:
        # ``except Exception`` instead of a bare ``except:`` so Ctrl-C
        # (KeyboardInterrupt) can still stop the script.
        new = ALL_PROXIES.pop()
        # Release the failed driver before building a replacement --
        # otherwise every retry leaks a ChromeDriver session.
        pd.quit()
        # reassign driver if fail to switch proxy
        pd = proxy_driver(ALL_PROXIES)
        print("--- Switched proxy to: %s" % new)
This is the error I get:
Traceback (most recent call last):
File "scripts2.py", line 65, in <module>
pd = proxy_driver(ALL_PROXIES)
File "scripts2.py", line 53, in proxy_driver
prox.http_proxy = pxy
UnboundLocalError: local variable 'pxy' referenced before assignment
I'm a little confused because I thought 'pxy' is assigned under if PROXIES?
Try changing this below lines
current code:
# (Quoted original: ``pxy`` is bound only on the ``if`` branch, which is
# exactly why the UnboundLocalError fires when PROXIES is empty.)
if PROXIES:
    pxy = PROXIES[-1] # script will fail if this condition not met
else:
    print("--- Proxies used up (%s)" % len(PROXIES))
    PROXIES = get_proxies()
Updated code:
# Make sure pxy is always bound before it is used.
pxy = ''
if not PROXIES:
    # ``PROXIES is None`` would miss an *empty* list, and the original
    # assigned pxy only inside this branch, so a non-empty pool left
    # pxy == '' forever.  Refill when exhausted, then always pick one.
    #print("--- Proxies used up (%s)" % len(PROXIES))
    PROXIES = get_proxies()
pxy = PROXIES[-1]
# I would check if pxy is empty or not before doing assignment
if (pxy!=''):
    pass  # Then do the logic here
Related
Below is my try to create a username availability checker with proxies, so far it works as intended
The only thing is that it's slow. I tried to implement threads, but it made no difference, as I'm not sure whether I'm doing it right.
used concurrent.futures and threading libraries.
Is there a better way to code this kind of programs or are there any other suggestions?
Thanks in advance
import requests
import json
import ctypes
import colorama
from colorama import Fore
from datetime import datetime
import os
os.system("cls")  # clear the console (Windows-only command)
# Start-of-run timestamp, shown in the console title bar below.
now = datetime.now()
current_time = now.strftime("%H:%M:%S")
colorama.init()  # enable ANSI colour codes on Windows consoles
url = "https://link"  # NOTE(review): placeholder endpoint -- confirm real URL
def grab_proxies():
    """Read proxy.txt and return its lines ("host:port") without newlines."""
    # ``with`` closes the file even on error; the original left the handle
    # open, and a comprehension replaces the manual append loop.
    with open('proxy.txt', 'r') as prx:
        return [line.rstrip("\n") for line in prx]
prlist = grab_proxies()
def grab_usernames():
    """Read userlist.txt and return the usernames, one per line."""
    # Same fix as grab_proxies: context manager instead of a leaked handle.
    with open('userlist.txt', 'r') as users:
        return [line.rstrip("\n") for line in users]
# Usernames to check, plus counters: found = hits written to found.txt,
# pc = current proxy index, uc = current username index.
ulist = grab_usernames()
found = 0
pc = 0
uc = 0
# Main loop: advance the username index (uc) on success and the proxy
# index (pc) on any failure.
# NOTE(review): the loop runs once per *proxy*, not per username, so a run
# can end with usernames unchecked; and pc can grow past len(prlist) after
# repeated failures, making prlist[pc] raise an (uncaught) IndexError.
for i in range(0,len(prlist)):
    # Windows-only: show live progress in the console title bar.
    ctypes.windll.kernel32.SetConsoleTitleW(f"[# Checker] | Counter: %s - Found: %s - Current Proxy: %s - Started at: %s" % (i, found, prlist[pc], current_time))
    try:
        # NOTE(review): ``headers`` is never defined in the code shown here;
        # this raises NameError unless it is created elsewhere -- confirm.
        req = requests.post(url,headers=headers, data = {"requested_username": ulist[uc], "xsrf_token": "F0kpyvjJgeBtsOk5Gl6Jvg"},proxies={'http' : prlist[pc],'https': prlist[pc]}, timeout=2)
        response = req.json()
        #print(response,req.status_code)
        #print(response)
        #print(type(response))
        if(response['reference']['status_code'] == 'TAKEN'):
            #rd = response['errors']['username'][0]['code']
            print(f'{Fore.LIGHTBLACK_EX}[{Fore.LIGHTRED_EX}Taken{Fore.LIGHTBLACK_EX}]{Fore.LIGHTCYAN_EX} {ulist[uc]}')
            #print(ulist[uc]+" Taken")
            uc+=1
        elif(response['reference']['status_code'] == 'OK'):
            print(f'{Fore.LIGHTBLACK_EX}[{Fore.LIGHTGREEN_EX}Available{Fore.LIGHTBLACK_EX}]{Fore.LIGHTCYAN_EX} {ulist[uc]}')
            #print(ulist[uc]+" Available")
            # Record available names as they are found.
            f = open("found.txt","a")
            f.write(ulist[uc]+"\n")
            f.close()
            found+=1
            uc+=1
        elif(response['reference']['status_code'] == 'INVALID_BEGIN'):
            print(f'{Fore.LIGHTBLACK_EX}[{Fore.LIGHTRED_EX}Invalid Username{Fore.LIGHTBLACK_EX}]{Fore.LIGHTCYAN_EX} {ulist[uc]}')
            uc+=1
        elif(response['reference']['status_code'] == 'DELETED'):
            print(f'{Fore.LIGHTBLACK_EX}[{Fore.LIGHTRED_EX}Deleted{Fore.LIGHTBLACK_EX}]{Fore.LIGHTCYAN_EX} {ulist[uc]}')
            uc+=1
        else:
            # Unexpected status code -- dump the raw response for debugging.
            print(response)
    except:
        # NOTE(review): a bare except hides *every* error (JSON decode,
        # KeyError, IndexError...), silently treating them all as "bad proxy".
        #print(prlist[pc]+ " Going to next proxy")
        pc+=1
        pass
        #break
x = input("Finished!.. press enter to exit")
You could use https://github.com/encode/requests-async to do your requests in an async way
Lets say I have a website that I want to scrape. Ex. cheapoair.com
I want to use a normal requests in python to scrape the data on the first, hypothetical page. If I end up being blocked by the server, I want to switch to a proxy. I have a list of proxy servers and a method, and I also have a list of user agent strings. However, I think I need help thinking through the problem.
For reference
uagen() will return a user agent string
proxit() will return a proxy
Here is what I have so far:
import requests
from proxy_def import *
from http import cookiejar
import time
from socket import error as SocketError
import sys
start_time = time.time()
class BlockAll(cookiejar.CookiePolicy):
    """Cookie policy that refuses to store or return any cookie.

    All four policy hooks answer False; the ``netscape``/``rfc2965``
    flags keep the attribute set CookiePolicy consumers expect.
    """

    def _refuse(self, *args, **kwargs):
        # PEP 8 (E731): define methods with ``def`` rather than assigning
        # a lambda; behavior is identical -- every hook returns False.
        return False

    return_ok = set_ok = domain_return_ok = path_return_ok = _refuse
    netscape = True
    rfc2965 = hide_cookie2 = False
# Random User-Agent for this session (uagen() comes from proxy_def).
headers = {'User-Agent': uagen()}
print(headers)
s = requests.Session()
s.cookies.set_policy(BlockAll)  # never persist cookies between requests
cookies = {'SetCurrency': 'USD'}  # still sent explicitly with each request
sp = proxit()  # first proxy to try (proxy_def helper)
# Keep fetching until a request succeeds; on any network failure rotate to
# a new proxy and a new User-Agent, then retry the same fetch.
for i in range(100000000000):
    while True:
        try:
            print('trying on ', sp)
            print('with user agent headers', headers)
            s.proxies = {"http": sp}
            r = s.get("http://www.cheapoair.com", headers=headers, timeout=15, cookies=cookies)
            print(i, sp, 'success')
            print("--- %s seconds ---" % (time.time() - start_time))
        except (SocketError, requests.ConnectionError, requests.Timeout) as e:
            # One handler replaces three byte-identical copies in the
            # original -- the recovery (new proxy + new UA) was the same
            # for every kind of network failure.
            print('passing ', sp)
            sp = proxit()
            headers = {'User-Agent': uagen()}
            print('this is the new proxy ', sp)
            print('this is the new headers ', headers)
            continue
        except KeyboardInterrupt:
            print("The program has been terminated")
            sys.exit(1)
        break  # success: leave the retry loop, move to the next i
#print(r.text)
print('all done',
'\n')
What I am looking for is an idea of how to say, start with a normal requests (not from a proxy), and if you end up with an error (such as being rejected by the server), switch to a proxy and try again.
I can almost picture it, but can't quite see it.
I'm thinking, that if I place a variable after
for i in range(1000000000000):
But before while true: That updates the sp then it might work. Another possibility it to maybe declare s.proxies = {"http": ""} and then if I run into an error, switch to s.poxies = {"http": "proxit()"} or s.poxies = {"http": "sp"}
Thanks!
I figured it out.
while True:
    try:
        # do this thing
        # but remove the variable from here and declare it before "while True"
        pass  # placeholder so the try block is syntactically valid
    except SocketError as e:  # NOTE: original said "SockerError" -- a typo that would itself raise NameError
        # switch headers, switch user agent string
        s.proxies = {"http": proxit()}
        continue
That will refresh the variable after it gets an error from the server
I tried to set up a new random proxy for each run on Firefox. I tried many ways, but only this one works, and I can't figure out how to make it random:
# Fragment: route Firefox through an HTTP proxy via profile preferences.
# NOTE(review): ``profile`` and ``port`` are defined elsewhere, and "Host"
# here is a literal string placeholder, not a variable -- confirm intent.
profile.set_preference("network.proxy.type", 1)  # 1 = manual proxy configuration
profile.set_preference("network.proxy.http", "Host")
profile.set_preference("network.proxy.http_port", port)
browser = webdriver.Firefox(profile)
I tried this example but not worked:
from selenium.webdriver.common.proxy import *
# Attempted alternative: hand a selenium Proxy object straight to Firefox.
myProxy = "xx.xx.xx.xx:xxxx"  # placeholder "ip:port"
proxy = Proxy({
    'proxyType': ProxyType.MANUAL,
    'httpProxy': myProxy,
    'ftpProxy': myProxy,
    'sslProxy': myProxy,
    'noProxy': '' # set this value as desired
})
driver = webdriver.Firefox(proxy=proxy)
driver.get("http://www.google.com")
This is the best way for me because i can use:
myProxy = random.choice(open('data.txt').readlines())
I tried to get proxies from a text file; this works, but I don't know how to randomize it:
# Parse every "host:port" line; after the loop ``proxy`` holds the last one.
with open('IPs.txt') as proxylist:
    for line in proxylist:
        # strip() removes the trailing newline that split(':') would
        # otherwise leave attached to the port.
        proxyserv, proxyport = line.strip().split(':')
        proxy = (proxyserv, int(proxyport))  # Firefox prefs need the port as an integer
And lastly I tried:
def random_line():
    """Return one uniformly random line (stripped) from IPs.txt.

    Reservoir sampling: the k-th line replaces the current pick with
    probability 1/k, so after one pass every line is equally likely and
    the file never has to be loaded whole.  Returns '' for an empty file.
    """
    selected_line = ''
    with open('IPs.txt') as f:
        # Iterating the file object replaces the C-style
        # ``while 1: line = f.readline()`` loop; enumerate keeps the
        # 1-based line count the probability test needs.  The sequence of
        # random.uniform calls is identical to the original's.
        for line_num, line in enumerate(f, start=1):
            if random.uniform(0, line_num) < 1:
                selected_line = line
    return selected_line.strip()
This one get random line but can't figure out how to parse the result to
X= IP
Y= PORT
and then:
profile.set_preference("network.proxy.type", 1)
profile.set_preference("network.proxy.http", "RANDOM IP")
profile.set_preference("network.proxy.http_port", Random PORT)
browser = webdriver.Firefox(profile)
Port needs to be an integer, you may want to use:
import random

# Read all proxies once and pick one at random; ``with`` closes the file
# (the original left the handle open).
with open('IPs.txt') as f:
    myProxy = random.choice(f.readlines())
parts = myProxy.strip().split(":") # strip removes spaces and line breaks
host = parts[0]
port = int(parts[1]) # port needs to be an integer
I made a program which gets one record from a Google Sheet, processes it, then deletes it, and so on. If I update the Google Sheet, the program will detect the record on the next loop, process it, and then delete it,
but it runs only 1 or 2 hours and then program gives an error:
What can I add in my program so my program never stops?
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
import traceback
import string
import gspread
from oauth2client.service_account import ServiceAccountCredentials
from selenium.common.exceptions import NoAlertPresentException
from selenium.common.exceptions import UnexpectedAlertPresentException
# --- Interactive credentials (Python 2: raw_input / print statements) ------
Email=raw_input('Please Enter your Email: ')
password=raw_input('Please Enter Password: ')
print("\n******Don't Interrupt the Script******")
print('#script is Runing............\n')
# --- Chrome setup -----------------------------------------------------------
chrome_options = webdriver.ChromeOptions() #going to chrome options
chrome_options.add_argument("--start-maximized")
prefs = {"profile.default_content_setting_values.notifications" : 2 #turn off all notifications
,"profile.managed_default_content_settings.images": 2} #disable images
chrome_options.add_experimental_option("prefs",prefs)
driver = webdriver.Chrome(chrome_options=chrome_options) # passing parameters to chrome
# --- Google account sign-in through the browser -----------------------------
driver.get('https://accounts.google.com')
time.sleep(3)
#giving Email-------------------
email = driver.find_element_by_id('Email')
email.send_keys(Email, Keys.RETURN)
#giving password----------------
time.sleep(3)
email = driver.find_element_by_id('Passwd')
email.send_keys(password, Keys.RETURN)
#credentials + attach with googleSheet------------------------------
# Service-account credentials for the Sheets API (a separate identity from
# the interactive browser login above).
scope = ['https://spreadsheets.google.com/feeds']
credentials = ServiceAccountCredentials.from_json_keyfile_name('stephens-31d8490b5bd2.json', scope)
google_sheet = gspread.authorize(credentials)
workSheet = google_sheet.open("Video Access Master Sheet").worksheet("Sheet1")
# Main polling loop: sweep rows 2-50, share each YouTube URL found with the
# e-mail address in the same row, blank the processed cells, then sleep.
while True:
    #fetch Records from Rows 2 to 50 and save on list-----------------
    for i in range(2,51):
        li_url=[]
        li_email=[]
        row=workSheet.row_values(i)
        for b in row:
            if 'youtu' in b:
                li_url.append(b)
                #find record which you append on list and then delete from googleSheet--------------------
                # NOTE(review): find() re-scans the whole sheet and is the
                # call implicated in the traceback; the row index ``i`` is
                # already known, so workSheet.update_cell(i, col, '') would
                # avoid it.  This also rebinds ``row`` (the list being
                # iterated) to an int -- iteration still works, but it is
                # confusing to read.
                cell = workSheet.find(b)
                row = cell.row
                col = cell.col
                workSheet.update_cell(row,col, '')
                print 'Fetching Values From Row '+str(i)+'....'
            elif '#' in b:
                li_email.append(b)  # presumably e-mail cells contain '#' -- confirm data format
            elif b=='':
                continue
            else:
                continue
        #*********************************************************
        #getting length list of li_url and apply condition on it-----------------------------------------------
        length=len(li_url)
        if length==0:
            continue  # nothing to share on this row
        else:
            try:
                #getting URLs from list and put into driver.get---------------------------------------------------------
                for a in li_url:
                    driver.get(a)
                    time.sleep(3)
                    driver.find_element_by_css_selector('.yt-uix-button-icon.yt-uix-button-icon-info.yt-sprite').click()
                    time.sleep(3)
                    driver.find_element_by_css_selector('.yt-uix-button.yt-uix-button-size-default.yt-uix-button-default.metadata-share-button').click()
                    time.sleep(2)
                    put_email=driver.find_element_by_css_selector('.yt-uix-form-input-textarea.metadata-share-contacts')
                    #getting emails from email list--------------------------------------------------------------
                    put_email.send_keys(li_email[0])
                    time.sleep(2)
                    driver.find_element_by_css_selector('.yt-uix-button.yt-uix-button-size-default.yt-uix-button-primary.sharing-dialog-button.sharing-dialog-ok').click()
                    time.sleep(4)
                    # NOTE(review): '#id' in this XPath looks like a garbled
                    # '@id' -- confirm against the original script.
                    driver.find_element_by_xpath('.//*[#id="video-header"]/div/button[2]/span').click()
                    time.sleep(10)
                    #for notifications and alters--------------------------------------------
                    try:
                        driver.switch_to.alert.accept()
                    except NoAlertPresentException:
                        pass
                    except UnexpectedAlertPresentException:
                        pass
            except:
                # NOTE(review): missing parentheses -- print_exc is never
                # actually called, so failures here are completely silent.
                traceback.print_exc
                pass
            print 'Row '+str(i)+' Successfully Updated. \n'
    # Pause between sweeps: 120 s is 2 minutes (the original comment said
    # "20minuts").  NOTE(review): the crash after ~1-2 h is consistent with
    # the gspread auth token expiring; re-authorizing inside the loop is
    # the first thing to investigate.
    time.sleep(120)
This is the error I got:
Traceback (most recent call last):
File "<stdin>", line 2, in <module>
File "<string>", line 56, in parse
File "<string>", line 35, in parse
cElementTree.ParseError: no element found: line 1, column 0
For some reason cell = workSheet.find(b) fails. Could be bad data in there; without seeing the input it's anyone's guess.
Since you already know the row number, you can avoid using cell = workSheet.find(b) by simply keeping track of the columns you're searching through and finally calling workSheet.update_cell(i, col, '') after copying the data.
I am running Selenium and PhantomJS to input search terms into a website and retrieve the number of hits for each search term. I have to do this 130,000+ times, so the code has been running nicely for a day until suddenly the program broke with the following error:
Traceback (most recent call last):
File "CBBPlyNwsScrape.py", line 82, in <module>
browser = webdriver.PhantomJS()
File "/Library/Python/2.7/site-packages/selenium/webdriver/phantomjs/webdriver.py", line 50, in __init__
self.service.start()
File "/Library/Python/2.7/site-packages/selenium/webdriver/phantomjs/service.py", line 69, in start
raise WebDriverException("Can not connect to GhostDriver")
selenium.common.exceptions.WebDriverException: Message: 'Can not connect to GhostDriver'
I'm running this on Mac OS X and Python 2.7.3. I have the latest versions of Selenium and PhantomJS installed. Can anyone tell me what is going on and why GhostDriver was working fine for so long and suddenly stopped?
In the ghostdriver.log file, this is all it contains:
PhantomJS is launching GhostDriver...
[ERROR - 2013-12-01T05:14:34.491Z] GhostDriver - Main - Could not start Ghost Driver => {
"message": "Could not start Ghost Driver",
"line": 82,
"sourceId": 4445044288,
"sourceURL": ":/ghostdriver/main.js",
"stack": "Error: Could not start Ghost Driver\n at :/ghostdriver/main.js:82",
"stackArray": [
{
"sourceURL": ":/ghostdriver/main.js",
"line": 82
}
]
}
Thanks
Installing the latest PhantomJS fixed this error; it was happening with the default Ubuntu 12.04 PhantomJS distro.
I was having the same problem. I don't know why the program has trouble calling the phantomJS webdriver, but the answer is to write a simple exception WebDriverException. This following code did the trick for me
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import NoSuchElementException, WebDriverException
import unittest, time, re, urllib2
# Python 2 scraper: for every line (URL prefix) in mother.txt, fetch
# prefix+item for items "1", "2", "3" with PhantomJS and write the
# #yelp_main_body HTML of each page into yes4/<line-number>.txt.
f = open("mother.txt","r") #opens file with name of "test.txt"
l = "1"
m = "2"
n = "3"
aTuple = ( l, m, n ) # create tuple
e = int(0)  # 1-based counter used for the output file name
for line in f:
    e += 1
    try:
        h = str(e)
        j = line  # NOTE(review): keeps the trailing newline, so base_url becomes "...\n1" -- confirm intended
        g = open("yes4/" + h + ".txt","w") #output file; never closed explicitly
        for item in aTuple:
            # A fresh PhantomJS per page; quit() below releases it.
            driver = webdriver.PhantomJS('phantomjs')
            base_url = j + item
            verificationErrors = []
            accept_next_alert = True
            driver.get(base_url)
            elem=driver.find_element_by_id("yelp_main_body")
            source_code=elem.get_attribute("outerHTML").encode('utf-8').strip()
            g.write(source_code)
            driver.quit()
    except WebDriverException:
        # One manual retry: the whole try body above is duplicated verbatim.
        # This is the workaround for the flaky "Can not connect to
        # GhostDriver" start-up failure.  NOTE(review): a second
        # WebDriverException here is uncaught and ends the run.
        print "e"
        h = str(e)
        j = line
        g = open("yes4/" + h + ".txt","w") #opens file with name of "test.txt"
        for item in aTuple:
            driver = webdriver.PhantomJS('phantomjs')
            base_url = j + item
            verificationErrors = []
            accept_next_alert = True
            driver.get(base_url)
            elem=driver.find_element_by_id("yelp_main_body")
            source_code=elem.get_attribute("outerHTML").encode('utf-8').strip()
            g.write(source_code)
            driver.quit()
    else:
        # Runs only when the try block succeeded without needing the retry.
        print h