I am trying to scrape a website using Python requests. The website can only be scraped through proxies, so I implemented code for that. However, it is banning all my requests even when I am using proxies, so I used https://api.ipify.org/?format=json to check whether the proxies are working properly. I found it shows my original IP even while using proxies. The code is below:
from concurrent.futures import ThreadPoolExecutor
import string, random
import requests
import sys

http = []
# loading http into the list
with open(sys.argv[1], "r", encoding="utf-8") as data:
    for i in data:
        http.append(i[:-1])
data.close()

url = "https://api.ipify.org/?format=json"

def fetch(session, url):
    for i in range(5):
        proxy = {'http': 'http://' + random.choice(http)}
        try:
            with session.get(url, proxies=proxy, allow_redirects=False) as response:
                print("Proxy : ", proxy, " | Response : ", response.text)
            break
        except:
            pass

# #timer(1, 5)
if __name__ == '__main__':
    with ThreadPoolExecutor(max_workers=1) as executor:
        with requests.Session() as session:
            executor.map(fetch, [session] * 100, [url] * 100)
        executor.shutdown(wait=True)
I tried a lot but don't understand why my own IP address is being shown instead of the proxy's IPv4 address. You can find the output of the code here: https://imgur.com/a/z02uSvi
The problem is that you have set a proxy only for http, but you are sending the request to a website that uses https. The solution is simple:
proxies = dict.fromkeys(('http', 'https', 'ftp'), 'http://' + random.choice(http))
# You can set proxy for session
session.proxies.update(proxies)
response = session.get(url)
# Or you can pass proxy as argument
response = session.get(url, proxies=proxies)
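For completeness, here is a minimal sketch of the fetch function from the question with this fix applied, assuming http is still the list of "host:port" proxy strings loaded at the top of the script:

import random
import requests

def fetch(session, url):
    for _ in range(5):
        # point both schemes at the same randomly chosen proxy
        proxy_address = 'http://' + random.choice(http)
        proxies = {'http': proxy_address, 'https': proxy_address}
        try:
            with session.get(url, proxies=proxies, allow_redirects=False) as response:
                print("Proxy : ", proxies, " | Response : ", response.text)
            break
        except requests.exceptions.RequestException:
            continue  # this proxy failed, try the next one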
Related
I'm using Playwright to extract data from a website and I want to use proxies which I get from this website: https://www.proxy-list.download/HTTPS. It doesn't work, and I'm wondering if this is because the proxies are free? If that is the reason, does anyone know where I can find proxies that will work?
This is my code:
from playwright.sync_api import sync_playwright
import time

url = "https://www.momox-shop.fr/livres-romans-et-litterature-C055/"

with sync_playwright() as p:
    browser = p.firefox.launch(
        headless=False,
        proxy={
            'server': '209.166.175.201:3128'
        })
    page = browser.new_page()
    page.goto(url)
    time.sleep(5)
Thank you!
Yes, according to your link, all of those proxies are "dead".
Before using proxies, check that they actually work. Here is one possible solution:
import json
import requests
from pythonping import ping
from concurrent.futures import ThreadPoolExecutor

check_proxies_url = "https://httpbin.org/ip"
good_proxy = set()

# proxy_lst = requests.get("https://www.proxy-list.download/api/v1/get", params={"type": "https"})
# proxies = [proxy for proxy in proxy_lst.text.split('\r\n') if proxy]
proxy_lst = requests.get("http://proxylist.fatezero.org/proxy.list")
proxies = (f"{json.loads(data)['host']}:{json.loads(data)['port']}" for data in proxy_lst.text.split('\n') if data)

def get_proxies(proxy):
    proxies = {
        "https": proxy,
        "http": proxy
    }
    try:
        response = requests.get(url=check_proxies_url, proxies=proxies, timeout=2)
        response.raise_for_status()
        if ping(target=proxies["https"].split(':')[0], count=1, timeout=2).rtt_avg_ms < 150:
            good_proxy.add(proxies["https"])
            print(f"Good proxy: {proxies['https']}")
    except Exception:
        print(f"Bad proxy: {proxies['https']}")

with ThreadPoolExecutor() as executor:
    executor.map(get_proxies, proxies)

print(good_proxy)
This gets a list of active proxies with a ping of up to 150 ms.
Output:
{'209.166.175.201:8080', '170.39.194.156:3128', '20.111.54.16:80', '20.111.54.16:8123'}
But in any case, these are shared proxies and their performance is not guaranteed. If you want to be sure that your scraper will work, it is better to buy a proxy.
I ran your code with the proxy '170.39.194.156:3128' obtained above, and for now it works.
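Once good_proxy contains working entries, one of them can be handed straight to Playwright. A minimal sketch, reusing the URL from your question and assuming good_proxy was filled by the checker above:

from playwright.sync_api import sync_playwright

proxy = good_proxy.pop()  # one of the proxies that passed the check above
with sync_playwright() as p:
    browser = p.firefox.launch(
        headless=False,
        proxy={'server': 'http://' + proxy}
    )
    page = browser.new_page()
    page.goto("https://www.momox-shop.fr/livres-romans-et-litterature-C055/")
    print(page.title())
    browser.close()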
I'm using a proxy service to cycle requests through different proxy IPs for web scraping. Do I need to build in functionality to end requests so as not to overload the web server I'm scraping?
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlencode
import concurrent.futures

list_of_urls = ['https://www.example']

NUM_RETRIES = 3
NUM_THREADS = 5

def scrape_url(url):
    params = {'api_key': 'API_KEY', 'url': url}
    # send request to scraperapi, and automatically retry failed requests
    for _ in range(NUM_RETRIES):
        try:
            response = requests.get('http://api.scraperapi.com/', params=urlencode(params))
            if response.status_code in [200, 404]:
                ## escape for loop if the API returns a successful response
                break
        except requests.exceptions.ConnectionError:
            response = ''

    ## parse data if 200 status code (successful response)
    if response.status_code == 200:
        ## do stuff
        pass

with concurrent.futures.ThreadPoolExecutor(max_workers=NUM_THREADS) as executor:
    executor.map(scrape_url, list_of_urls)
Hi, if you are using the latest version of requests, then most probably it is keeping the TCP connection alive. What you can do is create a session and configure it so that connections are not kept alive, and then proceed normally with your code:
s = requests.Session()
s.headers['Connection'] = 'close'  # disable HTTP keep-alive for every request made through this session
As discussed here, there really isn't such a thing as an "HTTP connection": what httplib refers to as the HTTPConnection is really the underlying TCP connection, which doesn't know much about your requests at all. requests abstracts that away and you won't ever see it.
The newest versions of requests do in fact keep the TCP connection alive after your request. If you do want your TCP connections to close, you can configure your requests not to use keep-alive, as shown above.
Alternatively, in very old versions of requests (before 1.0) the same thing was configured when creating the session:
s = requests.session(config={'keep_alive': False})
That config argument no longer exists in current releases, so prefer the Connection header shown above.
Updated version of your code:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlencode
import concurrent.futures

list_of_urls = ['https://www.example']

NUM_RETRIES = 3
NUM_THREADS = 5

def scrape_url(url):
    params = {'api_key': 'API_KEY', 'url': url}
    s = requests.Session()
    s.headers['Connection'] = 'close'  # do not keep the TCP connection alive between requests
    # send request to scraperapi, and automatically retry failed requests
    for _ in range(NUM_RETRIES):
        try:
            response = s.get('http://api.scraperapi.com/', params=urlencode(params))
            if response.status_code in [200, 404]:
                ## escape for loop if the API returns a successful response
                break
        except requests.exceptions.ConnectionError:
            response = ''

    ## parse data if 200 status code (successful response)
    if response.status_code == 200:
        ## do stuff
        pass

with concurrent.futures.ThreadPoolExecutor(max_workers=NUM_THREADS) as executor:
    executor.map(scrape_url, list_of_urls)
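If you would rather not change the session itself, the same effect can be had per request by sending the header explicitly; a minimal sketch, using the same placeholder API key and URL as above:

import requests

s = requests.Session()
response = s.get('http://api.scraperapi.com/',
                 params={'api_key': 'API_KEY', 'url': 'https://www.example'},
                 headers={'Connection': 'close'})  # the server closes the TCP connection after this response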
I'm experiencing some difficulty getting requests to utilise the proxy address when requesting a website. No error is returned, but when I have the script fetch http://ipecho.net/plain, I see my own IP, not that of the proxy.
import random
import requests
import time

def proxy():
    proxy = (random.choice(proxies)).strip()
    print("selected proxy: {0}".format(proxy))
    url = 'http://ipecho.net/plain'
    data = requests.get(url, proxies={"https": proxy})
    print(data)
    print("data returned: {0}".format(data.text))

proxies = []
with open("proxies.txt", "r") as fi:
    for line in fi:
        proxies.append(line)

while True:
    proxy()
    time.sleep(5)
The structure of the proxies.txt file is as follows:
https://95.215.111.184:3128
https://79.137.80.210:3128
Can anyone explain this behaviour?
The URL you are passing is http and you only provide an https proxy key. You need to create a key in your proxies dictionary for both http and https. These can point to the same value.
proxies = {'http': 'http://proxy.example.com', 'https': 'http://proxy.example.com'}
data = requests.get(url, proxies=proxies)
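Applied to the proxy() function from your script, a minimal sketch, assuming proxies.txt is still loaded into the proxies list as before:

import random
import requests

def proxy():
    chosen = random.choice(proxies).strip()
    print("selected proxy: {0}".format(chosen))
    url = 'http://ipecho.net/plain'
    # cover both plain-http and https URLs with the same proxy
    data = requests.get(url, proxies={"http": chosen, "https": chosen})
    print("data returned: {0}".format(data.text))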
This question has been addressed in various shapes and flavors, but I have not been able to apply any of the solutions I read online.
I would like to use Python to log into the site: https://app.ninchanese.com/login
and then reach the page: https://app.ninchanese.com/leaderboard/global/1
I have tried various stuff but without success...
Using POST method:
import urllib
import requests
oURL = 'https://app.ninchanese.com/login'
oCredentials = dict(email='myemail@hotmail.com', password='mypassword')
oSession = requests.session()
oResponse = oSession.post(oURL, data=oCredentials)
oResponse2 = oSession.get('https://app.ninchanese.com/leaderboard/global/1')
Using the authentication function from requests package
import requests
oSession = requests.session()
oResponse = oSession.get('https://app.ninchanese.com/login', auth=('myemail@hotmail.com', 'mypassword'))
oResponse2 = oSession.get('https://app.ninchanese.com/leaderboard/global/1')
Whenever I print oResponse2, I can see that I'm always on the login page so I am guessing the authentication did not work.
Could you please advise how to achieve this?
You have to send the csrf_token along with your login request:
import requests
import bs4

URL = 'https://app.ninchanese.com/login'
credentials = dict(email='myemail@hotmail.com', password='mypassword')

session = requests.session()
response = session.get(URL)
html = bs4.BeautifulSoup(response.text, 'html.parser')
# pull the CSRF token out of the login form and submit it together with the credentials
credentials['csrf_token'] = html.find('input', {'name': 'csrf_token'})['value']
response = session.post(URL, data=credentials)
response2 = session.get('https://app.ninchanese.com/leaderboard/global/1')
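To check whether the login actually worked, you can inspect where the second request ended up, for example:

# if authentication succeeded, response2 should be the leaderboard page, not the login form
print(response2.url)
print(response2.status_code)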
I'm using requests to make a simple web crawler, how would I go about directing all of the script's functions through a proxy so whatever website I am crawling doesn't know it is me?
To use requests to obtain a response behind a proxy in a script, or to use urllib2 features with a proxy, use the following snippet:
# Python 2 / urllib2 example
import json
import urllib2

proxy_url = "https://proxy:port"
proxy_support = urllib2.ProxyHandler({'https': proxy_url})
opener = urllib2.build_opener(proxy_support)
urllib2.install_opener(opener)  # every urllib2 call from here on goes through the proxy

url1 = "https://api_url"
req1 = urllib2.Request(url1)
print "response from API call is below"
res1 = urllib2.urlopen(req1)
response1 = res1.read()
print response1
jsonobj1 = json.loads(response1)
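Since the question is about requests, here is the equivalent setup with requests; a minimal sketch using the same placeholder proxy and API URL as above:

import requests

proxy_url = "https://proxy:port"
proxies = {"http": proxy_url, "https": proxy_url}

session = requests.Session()
session.proxies.update(proxies)  # every request made through this session now goes via the proxy

response = session.get("https://api_url")
print("response from API call is below")
print(response.text)
jsonobj = response.json()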