I keep getting this error in multiple scripts. I'm doing a lot of scraping: I have a loop that scrapes through hundreds of pages, and at some point the script just stops because of this error.
Here's an example of one of the scripts:
Example 2:
def scrape(urls):
    for url in urls:
        session = HTMLSession()
        resp = session.get(url)
        resp.html.render()
        try:
            phone = resp.html.find('span.phone')[0].text
        except IndexError:
            phone = None
        biz_name = resp.html.find('h1')[0].text
        try:
            biz_desc = resp.html.find('p.biz-description-text')[0].text
        except IndexError:
            biz_desc = None
        biz_location = resp.html.find('span.title-address-text')[0].text
        city = biz_location.split(',')[-1]
        print(
            f'phone is: {phone}\nthe business name is: {biz_name}\nthe description is: {biz_desc}\nthe city is: {city}')
        import_data(biz_name, phone, biz_desc, city)


def import_data(name, phone, desc, city):
    global keyword
    wp_title_box = driver.find_element_by_xpath('//*[@id="title"]')
    wp_title_box.send_keys(name)
    time.sleep(1)
    wp_desc_box = driver.find_element_by_xpath('//*[@id="content_ifr"]')
    wp_desc_box.send_keys(desc)
    time.sleep(1)
    new_field_button = driver.find_element_by_xpath('//*[@id="newmeta-submit"]')
    select_box = Select(driver.find_element_by_xpath('//*[@id="metakeyselect"]'))
    select_box.select_by_value("ad_city")
    wp_city_fill = driver.find_element_by_xpath('//*[@id="metavalue"]')
    wp_city_fill.send_keys(city)
    new_field_button.click()
    time.sleep(2)
    select_box.select_by_value("ad_phone")
    wp_city_fill = driver.find_element_by_xpath('//*[@id="metavalue"]')
    wp_city_fill.send_keys(phone)
    new_field_button.click()
    time.sleep(2)
    select_box.select_by_value("ad_promote")
    wp_city_fill = driver.find_element_by_xpath('//*[@id="metavalue"]')
    wp_city_fill.send_keys('1')
    new_field_button.click()
    time.sleep(2)
    save_btn = driver.find_element_by_xpath('//*[@id="save-post"]')
    driver.execute_script("window.scrollTo(0,0);")
    time.sleep(1)
    save_btn.click()
    time.sleep(2)
    driver.find_element_by_xpath('//*[@id="menu-posts"]/ul/li[3]/a').click()
    time.sleep(2)
I've added example 2, since example 1 was solved by the loop provided below.
In the second example the script should end because I'm using a for loop: once it has finished going through all of the URLs and importing them, it should be done. Am I missing something?
Your program never terminates: number() calls scrape(), which calls number(), which calls scrape(), and so on. If you are going to use recursion you need a terminating (base) case.
One suggestion is to pass a counter that tracks the depth of the recursion, increment it at each step, and stop once it reaches a specified maximum depth, as sketched below.
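A minimal sketch of that idea, assuming the scrape() function from your script and a purely illustrative MAX_DEPTH value:

import random

MAX_DEPTH = 100  # illustrative limit; choose whatever bound makes sense


def number(depth=0):
    # Base case: stop recursing once the maximum depth has been reached.
    if depth >= MAX_DEPTH:
        return
    x = random.randint(0, 99999999)
    scrape(x)          # scrape() as defined in your script
    number(depth + 1)  # recurse with the counter incremented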
That said, I don't think you need recursion at all for what you are doing; it is expensive due to the overhead of the function calls. A simple loop would be fine:
import random
import urllib3
from requests_html import HTMLSession

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)


def scrape(rand_num):
    session = HTMLSession()
    resp = session.get("https://www.example.com/prize/?d=" + '92' + str(rand_num))
    resp.html.render()
    print(f'trying coupon code 92{rand_num}')
    prize = resp.html.find(containing="You've won a prize")
    print(prize)
    if prize:
        print("https://www.example.com/prize/?d=" + '92' + str(rand_num))


def number():
    for i in range(99999999):
        x = random.randint(0, 99999999)
        scrape(x)


number()
The situation is that sometimes a request does not load or gets stuck in Python. If that happens, or any other error occurs, I would like to retry it "n" times, waiting up to a maximum of 3 seconds for each attempt, and once the attempts are exhausted print a message like f"Could not process {type_1} and {type_2}". Everything runs in parallel with concurrent.futures. Could you help me with that?
import requests
import concurrent.futures
import json

data = [['PEN', 'USD'], ['USD', 'EUR']]


def currency(element):
    type_1 = element[0]
    type_2 = element[1]
    s = requests.Session()
    url = f'https://usa.visa.com/cmsapi/fx/rates?amount=1&fee=0&utcConvertedDate=07%2F26%2F2022&exchangedate=07%2F26%2F2022&fromCurr={type_1}&toCurr={type_2}'
    a = s.get(url)
    response = json.loads(a.text)
    value = response["convertedAmount"]
    return value


with concurrent.futures.ProcessPoolExecutor() as executor:
    results = executor.map(currency, data)

    for value in results:
        print(value)
Your code is almost there. Here, I modified a few things:
from concurrent.futures import ThreadPoolExecutor
import time

import requests


def convert_currency(tup):
    from_currency, to_currency = tup
    url = (
        "https://usa.visa.com/cmsapi/fx/rates?amount=1&fee=0"
        "&utcConvertedDate=07%2F26%2F2022&exchangedate=07%2F26%2F2022&"
        f"fromCurr={from_currency}&toCurr={to_currency}"
    )
    session = requests.Session()
    for _ in range(3):
        try:
            response = session.get(url, timeout=3)
            if response.ok:
                return response.json()["convertedAmount"]
        except requests.exceptions.ConnectTimeout:
            time.sleep(3)
    return f"Could not process {from_currency} and {to_currency}"


data = [["VND", "XYZ"], ['PEN', 'USD'], ["ABC", "XYZ"], ['USD', 'EUR'], ["USD", "XXX"]]
with ThreadPoolExecutor() as executor:
    results = executor.map(convert_currency, data)
    for value in results:
        print(value)
Notes
It retries 3 times (see the for loop)
Use timeout= to specify the timeout (in seconds)
The .ok attribute tells you whether the call was successful
No need to import json, as the response object can be JSON-decoded with the .json() method
You might experiment between ThreadPoolExecutor and ProcessPoolExecutor to see which one performs better
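As a further option (not required for the code above), the retrying can also be pushed down to the transport layer. A minimal sketch using requests' HTTPAdapter with urllib3's Retry; the retry count, backoff and the placeholder URL are illustrative only:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

session = requests.Session()
# Retry up to 3 times, backing off between attempts, on typical transient statuses.
retries = Retry(total=3, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
session.mount("https://", HTTPAdapter(max_retries=retries))

# timeout= still applies to each individual attempt.
response = session.get("https://example.com/api", timeout=3)  # placeholder URL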
The code below is a sample from my complete program; I have tried to make it understandable.
It sends requests to a REST API. It starts with a URL and the number of pages for this specific search, and tries to fetch the content of each page.
Each page has several results. Each result becomes a FinalObject.
Because there are as many API requests as there are pages, I decided to use multi-threading and the concurrent.futures module.
It works, but as I'm new to coding and Python, I still have these 2 questions:
How can I use ThreadPoolExecutor sequentially in this case?
Is there a better way to handle multi-threading in this case?
from concurrent.futures import ThreadPoolExecutor
from requests import get as re_get


def main_function(global_page_number, headers, url_request):
    # create a list of page numbers
    pages_numbers_list = [i for i in range(global_page_number)]

    # for each page, call the page_handler (multi-threading)
    with ThreadPoolExecutor(max_workers=10) as executor:
        for item in pages_numbers_list:
            executor.submit(
                page_handler,
                item,
                url_request,
                headers
            )


def page_handler(page_number, url_request, headers):
    # we change the page number in the url request
    url_request = change_page(url_request, page_number)

    # new request with the new url
    result = re_get(url_request, headers=headers)
    result = result.json()

    # in the result, we find the list of dicts used to create the
    # final objects
    final_object_creation(result['results_list'])


def change_page(url_request, new_page_number):
    "to increment the value of the 'page=' attribute in the url"
    current_nb_page = ''
    start_nb = url_request.find("page=") + len('page=')
    while start_nb < len(url_request) and url_request[start_nb].isdigit():
        current_nb_page += url_request[start_nb]
        start_nb += 1

    new_url_request = url_request.replace("page=" + current_nb_page,
                                          "page=" + str(new_page_number))
    return new_url_request


def final_object_creation(results_list):
    'given the parsed response from requests.get(), it builds the final objects'
    global current_id_decision, dict_decisions

    # each item in the results list should become an instance of the final object
    for item in results_list:
        # define the id of the new Decision object
        current_id_decision += 1
        new_id = current_id_decision

        # create the Decision object and add it to the dict of decisions
        dict_decisions[new_id] = FinalObject(item)


class FinalObject:
    def __init__(self, content):
        self.content = content


current_id_decision = 0
dict_decisions = {}

main_function(1000, "headers", "https://api/v1.0/search?page=0&query=test")
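As for the ordering question above: one hedged approach is to let the threads only fetch, and consume the results sequentially, since executor.map() yields results in the same order as its inputs. A minimal sketch under that assumption, reusing change_page() and final_object_creation() from the code above:

from concurrent.futures import ThreadPoolExecutor
from requests import get as re_get


def fetch_page(page_number, url_request, headers):
    # only the network call runs in the worker thread
    return re_get(change_page(url_request, page_number), headers=headers).json()


def main_function(global_page_number, headers, url_request):
    with ThreadPoolExecutor(max_workers=10) as executor:
        # map() yields results in submission order, so pages can be consumed
        # sequentially (0, 1, 2, ...) even though they are fetched in parallel.
        pages = executor.map(lambda n: fetch_page(n, url_request, headers),
                             range(global_page_number))
        for result in pages:
            final_object_creation(result['results_list'])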
I am trying to write my own Python script to find an account's most-followed followers, and it seems to work fine. However, after a while, or after running the script more than 1-2 times, Instagram gives me a 'try again' error, which I've searched for and found is Instagram temporarily blocking my IP because I have sent too many requests at once.
Does anyone know a way to get around this?
MY CODE
"""
WHAT DOES THIS SCRIPT ACTUALLY DO?:
This script enables you to scrape all your followers and then find X top followed followers.
--------------------------------------------------------------------------------------------
NOTICE:
Unfortunately it is very hard now a days to scrape social media sites, due to
timeout issues, to many pings in a set time and other request restrictions.
So this script can only be ran 1-3 times a day.
I've tried also using exciting API's but all these are either too slow, or simply
show a '428' to many requests error.
"""
import instaloader
from selenium import webdriver
import time
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.keys import Keys
from rich.console import Console
from rich.table import Column, Table
# Global vars
L = instaloader.Instaloader()
URL = "https://www.instagram.com/{}/"
usernameGlobal = None
passwordGlobal = None
console = Console()
def get_followers():
    # Login
    while True:  # Keep running if password/username was wrong
        try:
            global usernameGlobal, passwordGlobal

            print("\n"+"*-=-*"*5)
            usernameGlobal = input("> Enter your username: ")
            passwordGlobal = input("> Enter your password: ")

            L.login(usernameGlobal, passwordGlobal)

            print("\n"+"-"*28+"\n> Successfully Logged In!")
            print("> Please leave this program running in the background")
            print("> Until you see the 'FINISHED' message"+"\n"+"-"*28)
            break
        except:
            print("\n"+"-"*28+"\n> Wrong Username / Password"+"\n"+"-"*28)

    # Obtain profile metadata
    profile = instaloader.Profile.from_username(L.context, usernameGlobal)

    follow_list = []

    # Loop through each follower and add to list
    for followee in profile.get_followers():
        follow_list.append(followee.username)

    return follow_list
def scrape_data(username):
    driver.get(URL.format(username))

    FOLLOWERS = 0

    try:
        try:
            FOLLOWERS = driver.find_element_by_xpath('/html/body/div[1]/section/main/div/header/section/ul/li[2]/a/span').text
        except:  # For people who you don't follow but follow you and have private accounts
            FOLLOWERS = driver.find_element_by_xpath('/html/body/div[1]/section/main/div/header/section/ul/li[2]/span/span').text
    except:
        print("\n"+"-"*28+"\n> Please try this script again later!"+"\n"+"-"*28)

    result = ''.join([i for i in FOLLOWERS if i.isdigit()])

    return int(float(result))
def driver_login():
    driver.get("https://www.instagram.com")
    time.sleep(3)

    element = driver.find_element_by_xpath("//input[@name='username']")
    element.send_keys(usernameGlobal)

    element = driver.find_element_by_xpath("//input[@name='password']")
    element.send_keys(passwordGlobal)
    element.send_keys(Keys.RETURN)
    time.sleep(3)

    # -- This is for if you have two factor authentication enabled --
    # element = driver.find_element_by_xpath("//input[@name='verificationCode']")
    # key = input("Enter Activation key: ")
    # element.send_keys(key)
    # element.send_keys(Keys.RETURN)
    # time.sleep(3)
def output_result(size, result):
    n_input = 0

    # Get user to select how many of the top followed followers they want
    while True:
        try:
            print("\n"+"*-=-*"*10)
            n_input = int(input("> How many of your top followed followers do you want to see?\n> E.g. 5 for top 5.\n> "))

            if n_input > size:
                continue

            break
        except:
            print("\n"+"-"*28+"\n> Invalid input. (Must be a number & less than your follower count)"+"\n"+"-"*28)

    # Make the table for a clean, user-friendly output and print it out
    table = Table(show_header=True, header_style="bold magenta")
    table.add_column("Your Followers", style="dim", width=12)
    table.add_column("Their Follower Count")

    for x in range(n_input):
        table.add_row(
            list(result.keys())[x-1],
            list(result.values())[x-1]
        )

    console.print(table)

    return
if __name__ == "__main__":
    list_of_followers = get_followers()

    # Initialize the selenium driver
    driver = webdriver.Chrome(ChromeDriverManager().install())

    driver_login()

    result = {}

    for follower in list_of_followers:
        followers = scrape_data(follower)
        result[follower] = followers

    # Sort the dictionary in descending order
    result = dict(sorted(result.items(), key=lambda x: x[1], reverse=True))

    print("\n> FINISHED")

    driver.quit()

    output_result(len(list_of_followers), result)

    exit(0)
You can potentially make an effectively unlimited number of requests if you use proxies. You can buy thousands of proxies from various sites and rotate through them.
Simply pass the proxies to your GET request and enjoy:
proxyDict = {
    "http": http_proxy,
    "https": https_proxy,
    "ftp": ftp_proxy
}

r = requests.get(url, headers=headers, proxies=proxyDict)
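A minimal sketch of rotating through a pool of proxies per request; the proxy addresses are placeholders, and picking one at random is just one possible rotation strategy:

import random
import requests

# hypothetical pool of proxies you have purchased
proxy_pool = [
    "http://111.111.111.111:8080",
    "http://222.222.222.222:8080",
]


def get_with_rotating_proxy(url, headers=None):
    proxy = random.choice(proxy_pool)  # pick a (potentially) different proxy each call
    proxy_dict = {"http": proxy, "https": proxy}
    return requests.get(url, headers=headers, proxies=proxy_dict, timeout=10)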
Also for Selenium, from this answer:
PROXY = "1.111.111.1:8080"  # your proxy

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--proxy-server=%s' % PROXY)

chrome = webdriver.Chrome(chrome_options=chrome_options)
chrome.get("https://www.instagram.com")
So I'm using the requests Python library to make a series of requests, i.e.
Req1, then Req2, then Req3.
The issue is that Req1 keeps repeating itself and never moves on to Req2.
Any help please?
Code
While true:
Try:
session = requests.session()
r = session.get('Url').text #req1
postdata = 'the post data'
myheader = {'the headers'}
n = session.post('Myurl ', data=postdata, headers=myheaders).text #req2
The script keeps repeating the GET request.
Your indentation could be the problem, as only the code indented under the while True loop will be repeated.
This in turn would cause the rest of the code to never run, as the loop will never end.
Some errors I also noticed were:
After the try: there is no except:
The T in Try: is uppercase when it should be lowercase
The t in true: should be uppercase
The W in While should be lowercase
A proper example would be:
while True:
    try:
        session = requests.session()
        r = session.get('https://example.com').text  # req1

        postdata = {'data': 'data'}
        myheaders = {'header': 'header'}

        n = session.post('https://example.com', data=postdata, headers=myheaders).text  # req2
    except:
        # Some logic for after an error occurs
        # ex:
        print("Error has occurred")
Now this is just nitpicking and isn't all that relevant, but the point of using requests.session() is to be a shorter way of typing out requests, so assigning it to a variable called session is a little redundant.
I'm putting together a Python script to make trades on Poloniex with the API, and so far I've got it to place trades when certain conditions are met, but I still need it to NOT place any more trades for the rest of that day (I have the entire script looping every 60 seconds).
So far I have this script:
import requests
import urllib.request
import urllib.parse
import http.client
import hashlib
import hmac
import time
import json
from urllib.request import urlopen

The_Currency_Pair = input('Which Currency Pair?\nPAIRS TO CHOOSE FROM:\nUSDT_BTC\nUSDT_XRP\nUSDT_ETH\nUSDT_BCH\nUSDT_STR\nUSDT_LTC\nUSDT_ETC\nUSDT_XMR\n')

api = 'https://poloniex.com/tradingApi'
key = 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXX'
secret = 'XXXXXXXXXXXXXXXXXXXXXXXXX'


def main():
    poloniexPrices = urlopen('https://poloniex.com/public?command=returnTicker').read()
    poloniexjson = json.loads(poloniexPrices)
    poloniexlastP = poloniexjson[The_Currency_Pair]['last']

    poloniexOCHL = urlopen('https://poloniex.com/public?command=returnChartData&currencyPair=USDT_BCH&start=1538352000&period=86400').read()
    poloniexOCHLjson = json.loads(poloniexOCHL)
    poloniexlasthigh = poloniexOCHLjson[-2]['high']

    print('Last Price')
    print(poloniexlastP)
    print('----------------------------------------')
    print('Last Day High')
    print(poloniexlasthigh)
    print('----------------------------------------')

    data = {
        'command': 'returnBalances',
        'nonce': int(time.time() * 1000)
    }

    data = urllib.parse.urlencode(data).encode()
    signature = hmac.new(secret.encode(), data, hashlib.sha512)

    headers = {
        'Key': key,
        'Sign': signature.hexdigest()
    }

    request = urllib.request.Request(
        url=api, data=data, headers=headers, method='POST'
    )

    text = urllib.request.urlopen(request).read().decode()

    print('MY ACCOUNT BALANCE')
    try:
        print(json.loads(text)['USDT'])
    except:
        print(text)
    print('-----------------------------------------')

    if float(poloniexlastP) > 0:
        print('PLACING TRADE')
        print('-----------------------------------------------')

        parms = {"command": "buy",
                 "currencyPair": The_Currency_Pair,
                 "rate": 100,
                 "immediateOrCancel": 1,
                 "amount": 0.01,
                 "nonce": int(time.time() * 1000)}

        parms = urllib.parse.urlencode(parms).encode()
        signature = hmac.new(secret.encode(), parms, hashlib.sha512)

        headers = {'Key': key,
                   'Sign': signature.hexdigest()}

        request = urllib.request.Request(
            url=api, data=parms, headers=headers, method='POST')

        text = urllib.request.urlopen(request).read().decode()

        ordernumber = (json.loads(text)['orderNumber'])

        print('Order Number:')
        print(ordernumber)


while True:
    main()
    time.sleep(60)
Anyway, after a trade has been placed, I need to make sure that after the 60-second sleep it doesn't make a second trade unless it is a new day, i.e. the day after the trade was made. (Could I use Poloniex server time for this?)
So, if it has got as far as print(ordernumber), that means it has placed a trade. But how do I mark that a trade has been placed for the day, and use that in the if float(poloniexlastP) > 0: check on the next loop to make sure it doesn't place another one?
Maybe you can use Python to get the date and create a global variable; after the print statement you set the variable to the current date, and the code will check whether a trade has already been sent, so that it doesn't execute more than once a day.
import datetime

# This Gets The Day Of The Month
todaysDateNumber = int(datetime.datetime.now().strftime("%d"))
dateNumberTradeSent = 0

if todaysDateNumber == dateNumberTradeSent:
    print("The API has already been used once today, try again tomorrow!")
else:
    # Call Your Trade Sending Code Here
    # After The Print Statement That Shows That The Trade Was Sent:
    dateNumberTradeSent = int(datetime.datetime.now().strftime("%d"))
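A hedged sketch of how this could be wired into the existing 60-second loop, reusing the main() name from the question; the date check is the only new part, and the trade-placement code itself is elided:

import datetime
import time

dateNumberTradeSent = 0  # day of the month on which the last trade was sent


def main():
    global dateNumberTradeSent

    todaysDateNumber = int(datetime.datetime.now().strftime("%d"))
    if todaysDateNumber == dateNumberTradeSent:
        print("A trade was already placed today, trying again tomorrow!")
        return

    # ... existing price checks and order placement from the question go here ...
    # Once print(ordernumber) has confirmed the trade, remember today's day number:
    dateNumberTradeSent = todaysDateNumber


while True:
    main()
    time.sleep(60)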