Detect http response encoding with aiohttp - python

I'm trying to learn how to use asyncio to build an asynchronous web crawler. The following is a crude crawler to test out the framework:
import asyncio, aiohttp
from bs4 import BeautifulSoup
@asyncio.coroutine  # Bug fix: was `#asyncio.coroutine` — as a comment the decorator never applied
def fetch(url):
    """Fetch *url* under the global semaphore and return the decoded body.

    NOTE(review): the charset is hard-coded to utf-8; prefer reading it
    from response.headers['Content-Type'] and falling back to utf-8.
    """
    with (yield from sem):  # cap concurrent requests at the semaphore size
        print(url)
        response = yield from aiohttp.request('GET', url)
        body = yield from response.read_and_close()
        return body.decode('utf-8')
@asyncio.coroutine  # Bug fix: decorator restored (was commented out, so it never applied)
def get_links(url):
    """Fetch *url* and return every href on the page that contains 'www'."""
    page = yield from fetch(url)
    # Explicit parser avoids bs4's "no parser specified" warning and makes
    # parsing reproducible across environments.
    soup = BeautifulSoup(page, 'html.parser')
    links = soup.find_all('a', href=True)
    return [link['href'] for link in links if 'www' in link['href']]
@asyncio.coroutine  # Bug fix: decorator restored (was commented out, so it never applied)
def crawler(seed, depth, max_depth=3):
    """Recursively crawl outward from *seed* until *depth* exceeds *max_depth*.

    Bug fix: the original `while True` loop re-fetched the same seed once per
    remaining depth level after each gather; recursion already advances the
    crawl, so a single pass per call is correct.
    """
    if depth > max_depth:
        return
    links = yield from get_links(seed)
    tasks = [asyncio.Task(crawler(link, depth + 1)) for link in links]
    yield from asyncio.gather(*tasks)
# Throttle to five concurrent fetches, then drive the crawl to completion.
sem = asyncio.Semaphore(5)
event_loop = asyncio.get_event_loop()
event_loop.run_until_complete(crawler("http://www.bloomberg.com", 0))
Whilst asyncio seems to be documented quite well, aiohttp seems to have very little documentation so I'm struggling to work some things out myself.
Firstly, is there a way for us to detect the encoding of page response?
Secondly, can we request that the connections are kept-alive within a session? Or is this by default True like in requests?

You can look at response.headers['Content-Type'] or use the chardet library for malformed HTTP responses. The response body is a bytes string.
For keep-alive connections you should use connector like:
connector = aiohttp.TCPConnector(share_cookies=True)
response1 = yield from aiohttp.request('get', url1, connector=connector)
body1 = yield from response1.read_and_close()
response2 = aiohttp.request('get', url2, connector=connector)
body2 = yield from response2.read_and_close()

Related

how can i speed up http request using python requests

I'm making a simple piece of code that sends HTTP requests over and over again to get a response that can change at any time, and it's a little bit slow. I've tried to speed it up by using asyncio, but it didn't work for me.
import requests

r = requests.Session()
url = 'https://example.com'
# Bug fix: {'user-agent: example'} is a *set* containing one string, not a
# dict — requests rejects it as headers.  Name and value must be separate.
headers = {'user-agent': 'example'}

def a():
    """Poll *url* forever, reacting when the body matches a known value."""
    while True:
        post = r.get(url, headers=headers).text
        if post == 'something':
            print(5)
        elif post == 'something-else':
            quit()

a()

ThreadPoolExecutor not Working Properly with Python Requests

When I requests a long list of urls from ThreadPoolExecutor I am encountering a problem where it seems like the request isn't being sent. I am expecting json responses from the urls
urls = ["example1.com", "example2.com", "example3.com"]  # ~1,500 urls in practice

def load_url(url):
    """GET *url*, print diagnostics, and return the Response object."""
    print("here")
    resp = requests.get(url)
    print(resp.url)
    print(resp.json())
    return resp
def main():
    """Fan GETs out over a thread pool and report each result as it completes."""
    urls = ["example1.com", "example2.com", "example3.com"]  # ~1,500 urls in practice
    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
        future_to_url = {executor.submit(load_url, url): url for url in urls}
        for future in concurrent.futures.as_completed(future_to_url):
            print("right here")
            url = future_to_url[future]
            try:
                response = future.result()
            except requests.exceptions.RequestException:
                # Bug fix: the original caught (HTTPError, ConnectionError,
                # ConnectTimeout); HTTPError and ConnectTimeout are never
                # imported, so the first failed request raised NameError
                # (and ConnectionError was the *builtin*, not requests').
                # RequestException is the common base of all three.
                print("fail")
            else:
                print(response.json())

main()
When I execute the following code, only the first url's request seems to have been sent and received by the as_completed function. The rest, despite printing "here" and the correct urls and the correct json response (from the load_url function), they are not sent to the as_completed function. So, it's printing "here" continuously but not "right here".
My python version is 3.9.1

problem with python requests while using proxies

I am trying to scrape a website using python requests. We can only scrape the website using proxies, so I implemented the code for that. However, it's banning all my requests even when I am using proxies, so I used the website https://api.ipify.org/?format=json to check whether the proxies were working properly or not. I found it showing my original IP even while using proxies. The code is below.
from concurrent.futures import ThreadPoolExecutor
import string, random
import requests
import sys
http = []
# Load one proxy address per line from the file named on the command line.
# Bug fix: the original called data.close() inside the `with` block — the
# context manager already closes the file.  Also, i[:-1] silently ate the
# last character of a final line without a trailing newline; rstrip('\n')
# removes only the newline.
with open(sys.argv[1], "r", encoding="utf-8") as data:
    for line in data:
        http.append(line.rstrip('\n'))

url = "https://api.ipify.org/?format=json"
def fetch(session, url):
    """Try up to five random proxies against *url*, printing the first success.

    Bug fix: the proxy was registered only for the 'http' scheme while the
    target URL is https, so requests bypassed the proxy entirely and exposed
    the real IP.  The same proxy must be mapped to both schemes.
    """
    for _ in range(5):
        proxy_addr = 'http://' + random.choice(http)
        proxies = {'http': proxy_addr, 'https': proxy_addr}
        try:
            with session.get(url, proxies=proxies, allow_redirects=False) as response:
                print("Proxy : ", proxies, " | Response : ", response.text)
                break
        except requests.RequestException:
            # Narrowed from a bare `except:` so programming errors and
            # KeyboardInterrupt are no longer silently swallowed.
            pass
# #timer(1, 5)
if __name__ == '__main__':
    # NOTE(review): max_workers=1 serializes all 100 calls — raise it for
    # real concurrency.
    with ThreadPoolExecutor(max_workers=1) as executor:
        with requests.Session() as session:
            executor.map(fetch, [session] * 100, [url] * 100)
        # Bug fix: the explicit executor.shutdown(wait=True) was removed —
        # leaving the `with ThreadPoolExecutor` block already shuts the pool
        # down and waits for every submitted task.
I tried a lot but didn't understand how my ip address is getting shown instead of the proxy ipv4. You will find output of the code here https://imgur.com/a/z02uSvi
The problem is that you have set a proxy for http but are sending the request to a website which uses https. The solution is simple:
# Map the same proxy address to every scheme the session might use; with
# only an 'http' entry, https requests bypass the proxy entirely.
proxies = dict.fromkeys(('http', 'https', 'ftp'), 'http://' + random.choice(http))
# You can set proxy for session
session.proxies.update(proxies)
response = session.get(url)
# Or you can pass proxy as argument
response = session.get(url, proxies=proxies)

Python Multithreading no current event loop

My Threading works fine only when fetch() is not called from validate(). But in this scenario below it returns
RuntimeError: There is no current event loop in thread
in all the threads 0 - 99 what am I doing wrong here ?
from threading import Thread
import requests
from bs4 import BeautifulSoup
from requests_html import HTMLSession
def fetch():
    """Render http://url in a headless session and return the #demo text."""
    # fetch data from another site
    session = HTMLSession()
    url = 'http://url'
    data = session.get(url)
    data.html.render()
    content = data.html.html
    soup = BeautifulSoup(content, "html.parser")
    iban = soup.find('p', {"id": "demo"})
    # Bug fix: the original returned `result.text`, but no `result` variable
    # exists anywhere — a guaranteed NameError that killed every thread.
    # The element parsed above is bound to `iban`.
    return iban.text
def validate():
    """POST the fetched value and report whether 'contact' appears in the reply."""
    url = "https://url"
    payload = {
        "data": fetch(),
        "veto": "fi",
    }
    reply = requests.post(url, data=payload)
    parsed = BeautifulSoup(reply.text, "html.parser")
    hit = parsed.body.find(text='contact')
    if hit:
        print(hit)
    else:
        print("no data")
if __name__ == "__main__":
    threads = []
    for i in range(100):
        # We start one thread per url present.
        process = Thread(target=validate)
        process.start()
        threads.append(process)
    # Bug fix: the threads were collected but never joined, so the main
    # thread could finish before the workers did.
    for process in threads:
        process.join()
From a quick search of the error, I found this Github issue that seems to show your problem and its solution.
It looks like you need to use asyncio and, at the beginning of each running thread, call asyncio.set_event_loop(asyncio.new_event_loop()).

Can't get desired results using try/except clause within scrapy

I've written a script in scrapy to make proxied requests using newly generated proxies by get_proxies() method. I used requests module to fetch the proxies in order to reuse them in the script. What I'm trying to do is parse all the movie links from it's landing page and then fetch the name of each movie from it's target page. My following script can use rotation of proxies.
I know there is an easier way to change proxies, like it is described here HttpProxyMiddleware but I would still like to stick to the way I'm trying here.
website link
This is my current attempt (It keeps using new proxies to fetch a valid response but every time it gets 503 Service Unavailable):
import scrapy
import random
import requests
from itertools import cycle
from bs4 import BeautifulSoup
from scrapy.crawler import CrawlerProcess
def get_proxies():
    """Scrape www.us-proxy.org and return 'ip:port' strings for HTTPS-capable rows."""
    page = requests.get("https://www.us-proxy.org/")
    soup = BeautifulSoup(page.text, "lxml")
    rows = soup.select("table.table tbody tr")
    return [
        ':'.join([row.select_one("td").text, row.select_one("td:nth-of-type(2)").text])
        for row in rows
        if "yes" in row.text
    ]
class ProxySpider(scrapy.Spider):
    """Crawl yts.am movie titles through a rotating proxy list.

    Bug fix: scrapy's HttpProxyMiddleware reads the meta key 'proxy'; the
    original set 'https_proxy', which the middleware ignores, so requests
    were never proxied at all.
    """
    name = "proxiedscript"
    handle_httpstatus_list = [503]
    proxy_vault = get_proxies()
    check_url = "https://yts.am/browse-movies"

    def start_requests(self):
        random.shuffle(self.proxy_vault)
        proxy_url = next(cycle(self.proxy_vault))
        request = scrapy.Request(self.check_url, callback=self.parse, dont_filter=True)
        request.meta['proxy'] = f'http://{proxy_url}'
        yield request

    def parse(self, response):
        print(response.meta)
        if "DDoS protection by Cloudflare" in response.css(".attribution > a::text").get():
            # Cloudflare challenge page: retry the listing with a fresh proxy.
            random.shuffle(self.proxy_vault)
            proxy_url = next(cycle(self.proxy_vault))
            request = scrapy.Request(self.check_url, callback=self.parse, dont_filter=True)
            request.meta['proxy'] = f'http://{proxy_url}'
            yield request
        else:
            for item in response.css(".browse-movie-wrap a.browse-movie-title::attr(href)").getall():
                nlink = response.urljoin(item)
                yield scrapy.Request(nlink, callback=self.parse_details)

    def parse_details(self, response):
        name = response.css("#movie-info h1::text").get()
        yield {"Name": name}
if __name__ == "__main__":
    # Run the spider in-process with a browser-like user agent.
    process = CrawlerProcess({'USER_AGENT': 'Mozilla/5.0'})
    process.crawl(ProxySpider)
    process.start()
To make sure whether the request is being proxied, I printed response.meta and could get results like this {'https_proxy': 'http://142.93.127.126:3128', 'download_timeout': 180.0, 'download_slot': 'yts.am', 'download_latency': 0.237013578414917, 'retry_times': 2, 'depth': 0}.
As I've overused the link to check how the proxied request within scrapy works, I'm getting 503 Service Unavailable error at this moment and I can see this keyword within the response DDoS protection by Cloudflare. However, I get valid response when I try with requests module applying the same logic I implemented here.
My earlier question: why I can't get the valid response as (I suppose) I'm using proxies in the right way? [solved]
Bounty Question: how can I define try/except clause within my script so that it will try with different proxies once it throws connection error with a certain proxy?
According to scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware docs
(and source)
proxy meta key is expected to use (not https_proxy)
#request.meta['https_proxy'] = f'http://{proxy_url}'
request.meta['proxy'] = f'http://{proxy_url}'
As scrapy didn't receive a valid meta key, your scrapy application didn't use proxies.
The start_requests() function is just the entry point. On subsequent requests, you would need to resupply this metadata to the Request object.
Also, errors can occur on two levels: proxy and target server
We need to handle bad response codes from both the proxy and the target server. Proxy errors are returned by the middleware to the errback function. The target server response can be handled during parsing from the response.status.
import scrapy
import random
import requests
from itertools import cycle
from bs4 import BeautifulSoup
from scrapy.crawler import CrawlerProcess
def get_proxies():
    """Scrape www.us-proxy.org and return 'ip:port' strings for HTTPS-capable rows."""
    page = requests.get("https://www.us-proxy.org/")
    soup = BeautifulSoup(page.text, "lxml")
    proxies = []
    for row in soup.select("table.table tbody tr"):
        if "yes" not in row.text:
            continue
        ip = row.select_one("td").text
        port = row.select_one("td:nth-of-type(2)").text
        proxies.append(f"{ip}:{port}")
    # proxy = ['https://52.0.0.1:8090', 'https://52.0.0.2:8090']
    return proxies
def get_random_proxy(proxy_vault):
    """Return a random proxy address from *proxy_vault*.

    The original shuffled the entire list in place and built an
    itertools.cycle just to take its first element; random.choice makes
    the same uniform pick in O(1) without mutating the caller's list.
    """
    return random.choice(proxy_vault)
class ProxySpider(scrapy.Spider):
    """Crawl yts.am through rotating proxies, retrying on errors at both the
    proxy level (errback) and the target-server level (response.status).
    """
    name = "proxiedscript"
    handle_httpstatus_list = [503, 502, 401, 403]
    check_url = "https://yts.am/browse-movies"
    proxy_vault = get_proxies()

    def handle_middleware_errors(self, *args, **kwargs):
        # implement middleware error handling here
        print('Middleware Error')
        # Retry the failed request with a different proxy; the original
        # callback was stashed in meta by make_request.
        yield self.make_request(url=args[0].request._url, callback=args[0].request._meta['callback'])

    def start_requests(self):
        yield self.make_request(url=self.check_url, callback=self.parse)

    def make_request(self, url, callback, dont_filter=True):
        """Build a Request routed through a random proxy, remembering the
        callback in meta so retries can reuse it."""
        return scrapy.Request(url,
                              meta={'proxy': f'https://{get_random_proxy(self.proxy_vault)}', 'callback': callback},
                              callback=callback,
                              dont_filter=dont_filter,
                              errback=self.handle_middleware_errors)

    def parse(self, response):
        print(response.meta)
        try:
            if response.status != 200:
                # implement server status code handling here - this loops forever
                print(f'Status code: {response.status}')
                # Bug fix: the original used a bare `raise` with no active
                # exception, which only "worked" by raising RuntimeError as
                # a side effect; raise an explicit exception instead.
                raise ValueError(f'bad status {response.status}')
            for item in response.css(".browse-movie-wrap a.browse-movie-title::attr(href)").getall():
                nlink = response.urljoin(item)
                yield self.make_request(url=nlink, callback=self.parse_details)
        except Exception:
            # Narrowed from a bare `except:`; if anything goes wrong fetching
            # the lister page, try again with a new proxy.
            yield self.make_request(url=self.check_url, callback=self.parse)

    def parse_details(self, response):
        print(response.meta)
        try:
            if response.status != 200:
                # implement server status code handling here - this loops forever
                print(f'Status code: {response.status}')
                raise ValueError(f'bad status {response.status}')
            name = response.css("#movie-info h1::text").get()
            yield {"Name": name}
        except Exception:
            # if anything goes wrong fetching the detail page, try again
            yield self.make_request(url=response.request._url, callback=self.parse_details)
if __name__ == "__main__":
    # Run the spider in-process with a browser-like user agent.
    crawler_process = CrawlerProcess({'USER_AGENT': 'Mozilla/5.0'})
    crawler_process.crawl(ProxySpider)
    crawler_process.start()

Categories