I've written a script in Scrapy that makes proxied requests using proxies freshly generated by the get_proxies() method. I used the requests module to fetch the proxies so they can be reused in the script. What I'm trying to do is parse all the movie links from the landing page and then fetch the name of each movie from its target page. My script below can rotate proxies.
I know there is an easier way to switch proxies, like the one described here with HttpProxyMiddleware, but I would still like to stick with the approach I'm trying here.
website link
This is my current attempt (it keeps using new proxies to try to fetch a valid response, but every time it gets 503 Service Unavailable):
import scrapy
import random
import requests
from itertools import cycle
from bs4 import BeautifulSoup
from scrapy.crawler import CrawlerProcess


def get_proxies():
    response = requests.get("https://www.us-proxy.org/")
    soup = BeautifulSoup(response.text,"lxml")
    proxy = [':'.join([item.select_one("td").text,item.select_one("td:nth-of-type(2)").text]) for item in soup.select("table.table tbody tr") if "yes" in item.text]
    return proxy


class ProxySpider(scrapy.Spider):
    name = "proxiedscript"
    handle_httpstatus_list = [503]
    proxy_vault = get_proxies()
    check_url = "https://yts.am/browse-movies"

    def start_requests(self):
        random.shuffle(self.proxy_vault)
        proxy_url = next(cycle(self.proxy_vault))
        request = scrapy.Request(self.check_url,callback=self.parse,dont_filter=True)
        request.meta['https_proxy'] = f'http://{proxy_url}'
        yield request

    def parse(self,response):
        print(response.meta)
        if "DDoS protection by Cloudflare" in response.css(".attribution > a::text").get():
            random.shuffle(self.proxy_vault)
            proxy_url = next(cycle(self.proxy_vault))
            request = scrapy.Request(self.check_url,callback=self.parse,dont_filter=True)
            request.meta['https_proxy'] = f'http://{proxy_url}'
            yield request
        else:
            for item in response.css(".browse-movie-wrap a.browse-movie-title::attr(href)").getall():
                nlink = response.urljoin(item)
                yield scrapy.Request(nlink,callback=self.parse_details)

    def parse_details(self,response):
        name = response.css("#movie-info h1::text").get()
        yield {"Name":name}


if __name__ == "__main__":
    c = CrawlerProcess({'USER_AGENT':'Mozilla/5.0'})
    c.crawl(ProxySpider)
    c.start()
To check whether the request was actually proxied, I printed response.meta and got results like this: {'https_proxy': 'http://142.93.127.126:3128', 'download_timeout': 180.0, 'download_slot': 'yts.am', 'download_latency': 0.237013578414917, 'retry_times': 2, 'depth': 0}.
As I've overused the link while checking how proxied requests work within Scrapy, I'm getting a 503 Service Unavailable error at the moment and I can see the phrase DDoS protection by Cloudflare in the response. However, I get a valid response when I apply the same logic with the requests module.
My earlier question: why can't I get a valid response when (I suppose) I'm using proxies the right way? [solved]
Bounty question: how can I define a try/except clause within my script so that it tries a different proxy once a certain proxy throws a connection error?
According to the scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware docs
(and source),
the proxy meta key is expected (not https_proxy):
#request.meta['https_proxy'] = f'http://{proxy_url}'
request.meta['proxy'] = f'http://{proxy_url}'
Since Scrapy didn't receive a valid meta key, your Scrapy application didn't use proxies at all.
The start_requests() function is just the entry point. On subsequent requests, you would need to resupply this metadata to the Request object.
Also, errors can occur on two levels: the proxy and the target server.
We need to handle bad response codes from both. Proxy errors are returned by the middleware to the errback function. The target server's response can be handled during parsing via response.status.
import scrapy
import random
import requests
from itertools import cycle
from bs4 import BeautifulSoup
from scrapy.crawler import CrawlerProcess


def get_proxies():
    response = requests.get("https://www.us-proxy.org/")
    soup = BeautifulSoup(response.text, "lxml")
    proxy = [':'.join([item.select_one("td").text, item.select_one("td:nth-of-type(2)").text]) for item in
             soup.select("table.table tbody tr") if "yes" in item.text]
    # proxy = ['https://52.0.0.1:8090', 'https://52.0.0.2:8090']
    return proxy


def get_random_proxy(proxy_vault):
    random.shuffle(proxy_vault)
    proxy_url = next(cycle(proxy_vault))
    return proxy_url


class ProxySpider(scrapy.Spider):
    name = "proxiedscript"
    handle_httpstatus_list = [503, 502, 401, 403]
    check_url = "https://yts.am/browse-movies"
    proxy_vault = get_proxies()

    def handle_middleware_errors(self, *args, **kwargs):
        # implement middleware error handling here
        print('Middleware Error')
        # retry request with a different proxy
        yield self.make_request(url=args[0].request._url, callback=args[0].request._meta['callback'])

    def start_requests(self):
        yield self.make_request(url=self.check_url, callback=self.parse)

    def make_request(self, url, callback, dont_filter=True):
        return scrapy.Request(url,
                              meta={'proxy': f'https://{get_random_proxy(self.proxy_vault)}', 'callback': callback},
                              callback=callback,
                              dont_filter=dont_filter,
                              errback=self.handle_middleware_errors)

    def parse(self, response):
        print(response.meta)
        try:
            if response.status != 200:
                # implement server status code handling here - this loops forever
                print(f'Status code: {response.status}')
                raise
            else:
                for item in response.css(".browse-movie-wrap a.browse-movie-title::attr(href)").getall():
                    nlink = response.urljoin(item)
                    yield self.make_request(url=nlink, callback=self.parse_details)
        except:
            # if anything goes wrong fetching the lister page, try again
            yield self.make_request(url=self.check_url, callback=self.parse)

    def parse_details(self, response):
        print(response.meta)
        try:
            if response.status != 200:
                # implement server status code handling here - this loops forever
                print(f'Status code: {response.status}')
                raise
            name = response.css("#movie-info h1::text").get()
            yield {"Name": name}
        except:
            # if anything goes wrong fetching the detail page, try again
            yield self.make_request(url=response.request._url, callback=self.parse_details)


if __name__ == "__main__":
    c = CrawlerProcess({'USER_AGENT': 'Mozilla/5.0'})
    c.crawl(ProxySpider)
    c.start()
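Since the comments above note that the status-code handling loops forever once every proxy is blocked, one possible way to stop that is to carry a counter in request.meta. This is only a sketch: the retry_count key and the limit of 5 are arbitrary choices, not something required by Scrapy.

    def make_request(self, url, callback, dont_filter=True, retry_count=0):
        return scrapy.Request(url,
                              meta={'proxy': f'https://{get_random_proxy(self.proxy_vault)}',
                                    'callback': callback,
                                    'retry_count': retry_count},
                              callback=callback,
                              dont_filter=dont_filter,
                              errback=self.handle_middleware_errors)

    def parse(self, response):
        retries = response.meta.get('retry_count', 0)
        if response.status != 200:
            if retries < 5:
                # retry the same page with another proxy, carrying the counter forward
                yield self.make_request(url=response.request.url, callback=self.parse,
                                        retry_count=retries + 1)
            else:
                # give up instead of looping forever
                self.logger.error('Giving up on %s after %d retries', response.url, retries)
            return
        for item in response.css(".browse-movie-wrap a.browse-movie-title::attr(href)").getall():
            yield self.make_request(url=response.urljoin(item), callback=self.parse_details)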
Related
I am trying to make a scraper for Discord to get all the members of a server, but I am stuck at the login. I can't find the CSRF token anywhere in the page source; a few sources say it is required, so maybe that is why I'm getting this error, but I'm not sure. Here's my spider causing the problem:
import scrapy
from scrapy.http import FormRequest


class RecruteSpider(scrapy.Spider):
    name = "Recruteur"

    def start_requests(self):
        urls = [
            'https://discord.com/login',
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.login)

    def login(self, response):
        url = 'https://discord.com/login'
        formdata = {"username": "SecretUserName", "password": "SecretPassword"}
        yield FormRequest.from_response(
            response=response,
            url=url,
            formdata=formdata,
            callback=self.afterLogin
        )

    def afterLogin(self, response):
        print("Success!!")
        # do stuff
When I run the program I get the error
ValueError: No element found in <200 https://discord.com/login>
Even though there clearly is a form element at that url.
I have also tried passing the login URL as the response argument to FormRequest.from_response, but then I get the error
AttributeError: 'str' object has no attribute 'encoding'
If you need any extra detail feel free to ask; any help is greatly appreciated. Thanks in advance.
The error you are getting is because Discord loads the /login page using JavaScript, so the response does not contain any form elements. You need to render the JavaScript using scrapy-playwright (personal favourite), Selenium, or scrapy-splash.
Also, your formdata variable contains invalid keys. See the screenshot of the payload sent to the server in the browser.
Using scrapy-playwright, I was able to get to the callback function as below. Also note that Discord may require you to solve a captcha once you send the login request, which presents another challenge you will need to solve.
discord.py
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy import FormRequest


class RecruteSpider(scrapy.Spider):
    name = "Recruteur"

    def start_requests(self):
        urls = ['https://discord.com/login']
        for url in urls:
            yield scrapy.Request(url=url, callback=self.login, meta={"playwright": True})

    def login(self, response):
        url = 'https://discord.com/login'
        formdata = {"login": "SecretUserName", "password": "SecretPassword"}
        yield FormRequest.from_response(
            response=response,
            url=url,
            formdata=formdata,
            callback=self.afterLogin
        )

    def afterLogin(self, response):
        print("Success!!")
        # do stuff


if __name__ == "__main__":
    process = CrawlerProcess(settings={
        "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
        "DOWNLOAD_HANDLERS": {
            "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
        },
    })
    process.crawl(RecruteSpider)
    process.start()
I am trying to scrape a website using Python requests. We can only scrape the website through proxies, so I implemented the code for that. However, it's banning all my requests even when I am using proxies, so I used https://api.ipify.org/?format=json to check whether the proxies were working properly or not. I found it showing my original IP even while using proxies. The code is below:
from concurrent.futures import ThreadPoolExecutor
import string, random
import requests
import sys

http = []

# loading http into the list
with open(sys.argv[1], "r", encoding="utf-8") as data:
    for i in data:
        http.append(i[:-1])
    data.close()

url = "https://api.ipify.org/?format=json"


def fetch(session, url):
    for i in range(5):
        proxy = {'http': 'http://' + random.choice(http)}
        try:
            with session.get(url, proxies=proxy, allow_redirects=False) as response:
                print("Proxy : ", proxy, " | Response : ", response.text)
                break
        except:
            pass


# #timer(1, 5)
if __name__ == '__main__':
    with ThreadPoolExecutor(max_workers=1) as executor:
        with requests.Session() as session:
            executor.map(fetch, [session] * 100, [url] * 100)
            executor.shutdown(wait=True)
I tried a lot but didn't understand how my IP address is getting shown instead of the proxy's. You will find the output of the code here: https://imgur.com/a/z02uSvi
The problem is that you have set a proxy for http but are sending the request to a website that uses https. The solution is simple:
proxies = dict.fromkeys(('http', 'https', 'ftp'), 'http://' + random.choice(http))
# You can set proxy for session
session.proxies.update(proxies)
response = session.get(url)
# Or you can pass proxy as argument
response = session.get(url, proxies=proxies)
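For example, a minimal self-contained version of that fix could look like the sketch below. The proxy addresses are placeholders for whatever you load from your file, and https://api.ipify.org simply echoes back the IP the target server sees, so it confirms whether the proxy is actually used.

import random
import requests

# placeholder proxy list; replace with the proxies loaded from your file
http = ['203.0.113.10:3128', '198.51.100.7:8080']

url = "https://api.ipify.org/?format=json"

with requests.Session() as session:
    # map every scheme to the same proxy so https requests are proxied too
    proxies = dict.fromkeys(('http', 'https', 'ftp'), 'http://' + random.choice(http))
    session.proxies.update(proxies)
    response = session.get(url, allow_redirects=False)
    # the returned IP should now be the proxy's address, not your own
    print("Proxy:", proxies['https'], "| Response:", response.text)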
I've written a script in Python using proxies to scrape the links of different posts while traversing different pages of a website. I've tried to make use of proxies from a list. The script is supposed to take a random proxy from the list, send a request to the website, and finally parse the items. However, if any proxy is not working, it should be kicked out of the list.
I thought the way I've used the proxies and the list of urls within ThreadPool(10).starmap(make_requests, zip(proxyVault,lead_url)) was accurate, but it doesn't produce any results; rather, the script gets stuck.
How can I pass the proxies and the links to the ThreadPool in order for the script to produce results?
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from multiprocessing.pool import ThreadPool
from itertools import cycle
import random

base_url = 'https://stackoverflow.com/questions/tagged/web-scraping'
lead_url = ["https://stackoverflow.com/questions/tagged/web-scraping?sort=newest&page={}&pagesize=15".format(page) for page in range(1,6)]

proxyVault = ['104.248.159.145:8888', '113.53.83.252:54356', '206.189.236.200:80', '218.48.229.173:808', '119.15.90.38:60622', '186.250.176.156:42575']


def make_requests(proxyVault, lead_url):
    while True:
        random.shuffle(proxyVault)
        global pitem
        pitem = cycle(proxyVault)
        proxy = {'https': 'http://{}'.format(next(pitem))}
        try:
            res = requests.get(lead_url, proxies=proxy)
            soup = BeautifulSoup(res.text, "lxml")
            [get_title(proxy, urljoin(base_url, item.get("href"))) for item in soup.select(".summary .question-hyperlink")]
        except Exception:
            try:
                proxyVault.pop(0)
                make_requests(proxyVault, lead_url)
            except Exception:
                pass


def get_title(proxy, itemlink):
    res = requests.get(itemlink, proxies=proxy)
    soup = BeautifulSoup(res.text, "lxml")
    print(soup.select_one("h1[itemprop='name'] a").text)


if __name__ == '__main__':
    ThreadPool(10).starmap(make_requests, zip(proxyVault, lead_url))
Btw, the proxies used above are just placeholders.
The problem with your code was that it was creating a lot of endless loops in the threads. Also, the way you handled the proxies was a bit strange to me, so I changed it.
I also think you had misunderstood how data is sent to the threads: each one gets a single element of the iterable, not the whole thing. So I changed some names to reflect that.
The way it works now is that each thread gets its own url from lead_url and then chooses a random proxy from the proxyVault.
It fetches the webpage, parses it, and calls get_title on each of the parsed links.
If the request fails because of the proxy, that proxy is removed from the list so it's not used again, and make_requests is called again, which will randomly choose a new proxy from the ones that are still available.
I did not change the actual parsing, because I can't judge if it's what you want or not.
Runnable code:
https://repl.it/#zlim00/unable-to-pass-proxies-and-links-to-the-threadpool-to-get-re
from bs4 import BeautifulSoup
from multiprocessing.pool import ThreadPool
from random import choice
import requests
from urllib.parse import urljoin

base_url = 'https://stackoverflow.com/questions/tagged/web-scraping'
lead_url = [f'https://stackoverflow.com/questions/tagged/web-scraping?sort='
            f'newest&page={page}&pagesize=15' for page in range(1, 6)]

proxyVault = ['36.67.57.45:53367', '5.202.150.233:42895',
              '85.187.184.129:8080', '109.195.23.223:45947']


def make_requests(url):
    proxy_url = choice(proxyVault)
    proxy = {'https': f'http://{proxy_url}'}
    try:
        res = requests.get(url, proxies=proxy)
        soup = BeautifulSoup(res.text, "lxml")
        [get_title(proxy, urljoin(base_url, item.get("href")))
         for item in soup.select(".summary .question-hyperlink")]
    except requests.exceptions.ProxyError:
        # Check so that the bad proxy was not removed by another thread
        if proxy_url in proxyVault:
            proxyVault.remove(proxy_url)
            print(f'Removed bad proxy: {proxy_url}')
        return make_requests(url)


def get_title(proxy, itemlink):
    res = requests.get(itemlink, proxies=proxy)
    soup = BeautifulSoup(res.text, "lxml")
    print(soup.select_one("h1[itemprop='name'] a").text)


if __name__ == '__main__':
    ThreadPool(10).map(make_requests, lead_url)
Maybe you can use another approach to get proxies, like this:
def get_proxy():
    url = 'https://free-proxy-list.net/anonymous-proxy.html'
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'lxml')
    table = soup.find('table', attrs={'id': 'proxylisttable'})
    table_body = table.find('tbody')
    proxies = table_body.find_all('tr')
    proxy_row = random.choice(proxies).find_all('td')
    return proxy_row[0].text + ':' + proxy_row[1].text
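If you go that route, a possible way to plug it into make_requests above is sketched below. Note this is only an illustration: it additionally needs import random, and the layout of the free-proxy-list table may change.

import random  # needed for random.choice inside get_proxy


def make_requests(url):
    # fetch a fresh proxy for every attempt instead of choosing from proxyVault
    proxy_url = get_proxy()
    proxy = {'https': f'http://{proxy_url}'}
    try:
        res = requests.get(url, proxies=proxy)
        soup = BeautifulSoup(res.text, "lxml")
        [get_title(proxy, urljoin(base_url, item.get("href")))
         for item in soup.select(".summary .question-hyperlink")]
    except requests.exceptions.ProxyError:
        # bad proxy: simply retry; get_proxy() will hand out another one
        return make_requests(url)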
I use Scrapy very often to check long lists of links for whether they're available or dead.
My problem is that when a link is incorrectly formatted, for example it doesn't start with http:// or https://, the crawler crashes:
ValueError: Missing scheme in request url: http.www.gobiernoenlinea.gob.ve/noticias/viewNewsUser01.jsp?applet=1&id_noticia=41492
I read the list of links from a pandas Series and check each of them. When the response is reachable I log it as "ok", otherwise as "dead".
import scrapy
import pandas as pd
from link_checker.items import LinkCheckerItem


class Checker(scrapy.Spider):
    name = "link_checker"

    def get_links(self):
        df = pd.read_csv(r"final_07Sep2018.csv")
        return df["Value"]

    def start_requests(self):
        urls = self.get_links()
        for url in urls.iteritems():
            index = {"index": url[0]}
            yield scrapy.Request(url=url[1], callback=self.get_response, errback=self.errback_httpbin, meta=index, dont_filter=True)

    def get_response(self, response):
        url = response.url
        yield LinkCheckerItem(index=response.meta["index"], url=url, code="ok")

    def errback_httpbin(self, failure):
        yield LinkCheckerItem(index=failure.request.meta["index"], url=failure.request.url, code="dead")
I am still interested in spotting those incorrectly formatted urls. How can I validate them and yield "dead" for those as well?
You just check whether it starts with http: or https:.
If not, prepend http:// manually:
if not LINK.startswith('http:') and not LINK.startswith('https:'):
LINK = "http://" + LINK
I'm trying to crawl a site and check that no links redirecting to pages within the site are down. As there is no sitemap available, I'm using Scrapy to crawl the site and get all the links on every page, but I can't get it to output a file with all the links found and their status codes. The site I'm using to test the code is quotes.toscrape.com and my code is:
from scrapy.spiders import Spider
from mytest.items import MytestItem
from scrapy.http import Request
import re


class MySpider(Spider):
    name = "sample"
    allowed_domains = ["quotes.toscrape.com"]
    start_urls = ["http://quotes.toscrape.com"]

    def parse(self, response):
        links = response.xpath('//a/@href').extract()
        # We stored already crawled links in this list
        crawledLinks = []
        for link in links:
            # If it is a proper link and is not checked yet, yield it to the Spider
            if link not in crawledLinks:
                link = "http://quotes.toscrape.com" + link
                crawledLinks.append(link)
                yield Request(link, self.parse)
I've tried adding the following lines after yield:
item = MytestItem()
item['url'] = link
item['status'] = response.status
yield item
But it gets me a bunch of duplicates and no URLs with status 404 or 301. Does anyone know how I can get all the URLs with their status?
Scrapy by default does not return any unsuccessful requests, but you can fetch them and handle them in one of your functions if you set errback on the request.
def parse(self, response):
    # some code
    yield Request(link, self.parse, errback=self.parse_error)

def parse_error(self, failure):
    # log the response as an error
The parameter failure will contain more information on the exact reason for the failure, because it could be an HTTP error (where you can fetch a response), but also a DNS lookup error and the like (where there is no response).
The documentation contains an example of how to use failure to determine the error reason and access the Response if available:
# imports needed for the failure checks below
from scrapy.spidermiddlewares.httperror import HttpError
from twisted.internet.error import DNSLookupError
from twisted.internet.error import TimeoutError, TCPTimedOutError

def errback_httpbin(self, failure):
    # log all failures
    self.logger.error(repr(failure))

    # in case you want to do something special for some errors,
    # you may need the failure's type:
    if failure.check(HttpError):
        # these exceptions come from HttpError spider middleware
        # you can get the non-200 response
        response = failure.value.response
        self.logger.error('HttpError on %s', response.url)

    elif failure.check(DNSLookupError):
        # this is the original request
        request = failure.request
        self.logger.error('DNSLookupError on %s', request.url)

    elif failure.check(TimeoutError, TCPTimedOutError):
        request = failure.request
        self.logger.error('TimeoutError on %s', request.url)
You should use HTTPERROR_ALLOW_ALL in your settings or set the meta key handle_httpstatus_all = True in all your requests; please refer to the docs for more information.
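For illustration, a minimal sketch of a spider that records a status for every crawled URL once non-200 responses are allowed through (it reuses the url and status fields from MytestItem above and relies on Scrapy's built-in dupefilter instead of the manual crawledLinks list):

import scrapy
from mytest.items import MytestItem


class StatusSpider(scrapy.Spider):
    name = "status_sample"
    allowed_domains = ["quotes.toscrape.com"]
    start_urls = ["http://quotes.toscrape.com"]
    # let all HTTP status codes reach the parse callback instead of being filtered out
    custom_settings = {"HTTPERROR_ALLOW_ALL": True}

    def parse(self, response):
        # record the status of the page we just fetched
        yield MytestItem(url=response.url, status=response.status)
        # follow internal links; the default dupefilter drops already-seen URLs,
        # which avoids the duplicate items produced by the manual list approach
        for href in response.xpath('//a/@href').getall():
            yield scrapy.Request(response.urljoin(href), callback=self.parse)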