I'm scraping an e-commerce website with Scrapy + Playwright. After roughly one hour it returns about 42k records and then crashes with this message:
node:events:505
throw er; // Unhandled 'error' event
^
Error: write EPIPE
at WriteWrap.onWriteComplete [as oncomplete] (node:internal/stream_base_commons:94:16)
Emitted 'error' event on Socket instance at:
at emitErrorNT (node:internal/streams/destroy:157:8)
at emitErrorCloseNT (node:internal/streams/destroy:122:3)
at processTicksAndRejections (node:internal/process/task_queues:83:21) {
errno: -32,
code: 'EPIPE',
syscall: 'write'
}
UPDATE: I checked and the browser's memory usage keeps growing until it crashes; it seems that the browser, Scrapy or Playwright is not closing the open instances.
UPDATE 2: I'm posting the code because I suspect some routine is blowing up the memory. Any ideas would be most welcome.
import scrapy
from scrapy_playwright.page import PageMethod


class LojaSpider(scrapy.Spider):
    name = 'loja'
    start_urls = ['https://www.loja.com.br']
    allowed_domains = ['loja.com.br']

    def parse(self, response, **kwargs):
        # Get the links of the main departments.
        for link in response.xpath('//div[1][@class="nav-list-column"]//ul//li[@class="nav-item"]/a/@href'):
            yield response.follow(
                url=link.root,
                errback=self.errback,
                meta={
                    'playwright': True,
                    'playwright_page_methods': [PageMethod('wait_for_selector', 'div.css-1431dge-generic-carousel-generic-carousel--with-offset.e1w2odgr0')],
                },
                callback=self.parse_sublinks)

    def parse_sublinks(self, response):
        # Visit all sublinks and collect the links to the product grid.
        for link in response.css('div.swiper-slide'):
            url = link.css('a').attrib['href']
            yield response.follow(
                url=url,
                errback=self.errback,
                meta={
                    'playwright': True,
                    # "playwright_include_page": True,
                    'playwright_page_methods': [PageMethod('wait_for_selector', 'div.css-agchm8-products--product-list.e1r1bq5m1')],
                },
                callback=self.parse_products)

    async def parse_products(self, response):
        print('Link verificado: ', response.url)
        # Check whether we are on the product grid and collect the items.
        if response.css('div.css-agchm8-products--product-list.e1r1bq5m1'):
            for product in response.css('div.new-product-thumb'):
                yield {
                    'description': product.css('span.css-1eaoahv-ellipsis.enof0xo0::text').get(),
                    'price': product.css('span.css-gcz93e-price-tag__price.ehxrgxb3::text').get(),
                    'installment_price': product.css('span.css-4id40w-price-tag__content-wrapper.ehxrgxb0::text').get(),
                    'code': product.css('span.css-19qfvzb-new-product-thumb__product-code.ecqorlx1::text').get(),
                    'link': product.css('a').attrib['href']
                }
            current_page = response.css('div.css-bslizp-badge::text').get()
            next_link = response.css('i.glyph.glyph-arrow-right').get()
            next_page = int(current_page) + 1 if next_link else None
            if next_page:
                if int(current_page) > 1:
                    next_page_url = response.url[:-8] + '?page=' + str(next_page).rjust(2, '0')
                else:
                    next_page_url = response.url + '?page=' + str(next_page).rjust(2, '0')
                yield response.follow(
                    next_page_url,
                    errback=self.errback,
                    meta={
                        'playwright': True,
                        # "playwright_include_page": True,
                        'playwright_page_methods': [PageMethod('wait_for_selector', 'div.css-agchm8-products--product-list.e1r1bq5m1')],
                    },
                    callback=self.parse_products
                )
        # Check whether we are on a direct product page and collect the item.
        elif response.css('div.box-1'):
            print('Entrou no elif do div.box-1')
            yield {
                'description': response.css('h1.product-title.align-left.color-text product-description::text').get(),
                'price': response.css('span.css-rwb0cd-to-price__integer.e17u5sne7::text').get(),
                'installment_price': response.css('p.css-1b49m6w-text-text--bold-text-color--n400-text--kilo-heading--no-margin::text').get(),
                'code': response.css('div.badge.product-code.badge-product-code::text').get(),
            }
        else:
            print('Entrou no else para obter os links')
            # If we are neither on the grid nor on a product page, go back to
            # parse_sublinks with this URL to capture the links from the JS carousel.
            yield response.follow(
                url=response.url,
                errback=self.errback,
                meta={
                    'playwright': True,
                },
                callback=self.parse_sublinks)
        # page = response.meta['playwright_page']
        # await page.close()

    async def errback(self, failure):
        page = failure.request.meta["playwright_page"]
        await page.close()
I searched a lot but couldn't find anything relevant for Scrapy (Python) + Playwright. I previously had an out-of-memory error, which I worked around with export NODE_OPTIONS="--max-old-space-size=8192"; I don't know if that is connected.
I had the same issue, and it was caused by an unclosed page instance; however, I see that in your code the page is closed.
Please make sure you did the following steps:
Close all playwright instances
Close all BrowserContext objects
Close all opened Page objects
An opened Playwright instance looks like this:
playwright_instance = await async_playwright().start()
browser = await playwright_instance.firefox.launch(headless=headless, proxy=PROXIES[0])
context = await browser.new_context(java_script_enabled=True)
Here is my solution:
await page.close()
await browser.close()
await playwright_instance.stop()
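For scrapy-playwright specifically, the browser is managed by the download handler, so the part you control is the page: request it with playwright_include_page and close it in both the callback and the errback. Below is a minimal sketch under that assumption; the spider name, URL and yielded field are placeholders, not taken from the question.

import scrapy


class ExampleSpider(scrapy.Spider):
    name = "example"

    def start_requests(self):
        yield scrapy.Request(
            "https://example.com",
            meta={
                "playwright": True,
                "playwright_include_page": True,  # hand the Page object to the callback
            },
            callback=self.parse,
            errback=self.errback,
        )

    async def parse(self, response):
        page = response.meta["playwright_page"]
        try:
            yield {"title": await page.title()}
        finally:
            await page.close()  # release the page even if parsing raises

    async def errback(self, failure):
        page = failure.request.meta.get("playwright_page")
        if page is not None:
            await page.close()

If your scrapy-playwright version supports them, settings such as PLAYWRIGHT_MAX_PAGES_PER_CONTEXT and PLAYWRIGHT_MAX_CONTEXTS can also cap how many pages and contexts stay open at once.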
I've created a scraper using the requests module that rotates proxies (taken from a free proxy site) to fetch content from Yellow Pages.
The script appears to work correctly, but it is terribly slow because it takes a long time to find a working proxy. I've tried to reuse the same working proxy (once found) until it dies, and for that I had to declare proxies and proxy_url as global.
Although shop_name and categories are available on the landing pages, I scraped both from the inner pages so that the script demonstrates that it reuses the same working proxy (once it finds one) multiple times.
This is the script I'm trying with:
import random
import requests
from bs4 import BeautifulSoup

base = 'https://www.yellowpages.com{}'
link = 'https://www.yellowpages.com/search?search_terms=pizza&geo_location_terms=Los+Angeles%2C+CA'

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36',
}


def get_proxies():
    response = requests.get('https://www.sslproxies.org/')
    soup = BeautifulSoup(response.text, "lxml")
    proxies = []
    for item in soup.select("table.table tbody tr"):
        if not item.select_one("td"):
            break
        ip = item.select_one("td").text
        port = item.select_one("td:nth-of-type(2)").text
        proxies.append(f"{ip}:{port}")
    return [{'https': f'http://{x}'} for x in proxies]


def fetch_resp(link, headers):
    global proxies, proxy_url
    while True:
        print("currently being used:", proxy_url)
        try:
            res = requests.get(link, headers=headers, proxies=proxy_url, timeout=10)
            print("status code", res.status_code)
            assert res.status_code == 200
            return res
        except Exception as e:
            proxy_url = proxies.pop(random.randrange(len(proxies)))


def fetch_links(link, headers):
    res = fetch_resp(link, headers)
    soup = BeautifulSoup(res.text, "lxml")
    for item in soup.select(".v-card > .info a.business-name"):
        yield base.format(item.get("href"))


def get_content(link, headers):
    res = fetch_resp(link, headers)
    soup = BeautifulSoup(res.text, "lxml")
    shop_name = soup.select_one(".sales-info > h1.business-name").get_text(strip=True)
    categories = ' '.join([i.text for i in soup.select(".categories > a")])
    return shop_name, categories


if __name__ == '__main__':
    proxies = get_proxies()
    proxy_url = proxies.pop(random.randrange(len(proxies)))
    for inner_link in fetch_links(link, headers):
        print(get_content(inner_link, headers))
How can I quickly select a functional proxy from a list of proxies?
Please let me point out that using free proxy IP addresses can be highly problematic. These types of proxies are notorious for connection issues, such as latency-related timeouts. They can also be intermittent, meaning they can go down at any time, and because they are often abused they can get blocked.
With that being said, below are multiple methods that can be used to accomplish your use case related to scraping content from the Yellow Pages.
UPDATE 07-11-2022 16:47 GMT
I tried a different proxy validation method this morning. It is slightly faster than the proxy judge method. The issue with both methods is error handling: I have to catch all the errors below both when validating a proxy IP address and when passing a validated address to your function fetch_resp.
ConnectionResetError
requests.exceptions.ConnectTimeout
requests.exceptions.ProxyError
requests.exceptions.ConnectionError
requests.exceptions.HTTPError
requests.exceptions.Timeout
requests.exceptions.TooManyRedirects
urllib3.exceptions.MaxRetryError
urllib3.exceptions.ProxySchemeUnknown
urllib3.exceptions.ProtocolError
Occasionally a proxy fails when extracting from a page, which causes a delay. There is nothing you can do to prevent these failures. The only thing you can do is catch the error and reprocess the request.
I was able to improve the extraction time by adding threading to function get_content.
Content Extraction Runtime: 0:00:03.475362
Total Runtime: 0:01:16.617862
The only way to increase the speed of your code further is to redesign it so that it queries each page at the same time; otherwise this remains a timing bottleneck.
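As a rough illustration of that redesign, here is a sketch of threading the per-page fetches with concurrent.futures. The name get_content_threaded, the page range and the worker count are my own choices, and fetch_resp is assumed to be the function shown further down in this answer.

from concurrent.futures import ThreadPoolExecutor, as_completed

from bs4 import BeautifulSoup


def get_content_threaded(page_numbers, http_headers, proxy_url, max_workers=10):
    # Fetch every result page concurrently instead of one at a time.
    results = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {
            executor.submit(
                fetch_resp,
                f"https://www.yellowpages.com/search?search_terms=pizza"
                f"&geo_location_terms=Los%20Angeles%2C%20CA&page={page}",
                http_headers,
                proxy_url,
            ): page
            for page in page_numbers
        }
        for future in as_completed(futures):
            response = future.result()
            if response is None:
                continue  # the proxy failed for this page; it could be re-queued here
            soup = BeautifulSoup(response.text, "lxml")
            for section in soup.find_all("li", {"class": "business-card"}):
                shop_name = section.find("h2", {"class": "title business-name"})
                categories = ", ".join(
                    a.text for a in section.find_all("a", {"class": "category"})
                )
                results.append((shop_name.text, categories))
    return results

Calling it as get_content_threaded(range(1, 25), headers, PROXIES) would replace the sequential loop inside get_content shown below.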
Here is the code that I used to validate the proxy addresses.
def check_proxy(proxy):
    try:
        session = requests.Session()
        session.headers['User-Agent'] = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36'
        session.max_redirects = 300
        proxy = proxy.split('\n', 1)[0]
        # print('Checking ' + proxy)
        req = session.get("http://google.com", proxies={'http': 'http://' + proxy}, timeout=30, allow_redirects=True)
        if req.status_code == 200:
            return proxy
    except requests.exceptions.ConnectTimeout as e:
        return None
    except requests.exceptions.ConnectionError as e:
        return None
    except ConnectionResetError as e:
        # print('Error,ConnectionReset!')
        return None
    except requests.exceptions.HTTPError as e:
        return None
    except requests.exceptions.Timeout as e:
        return None
    except ProxySchemeUnknown as e:
        return None
    except ProtocolError as e:
        return None
    except requests.exceptions.ChunkedEncodingError as e:
        return None
    except requests.exceptions.TooManyRedirects as e:
        return None
UPDATE 07-10-2022 23:53 GMT
I did some more research into this question. I have noted that the website https://www.sslproxies.org provides a list of 100 HTTPS proxies. Out of those, fewer than 20% pass the proxy judge test. Even after obtaining this 20%, some will still fail when passed to your function fetch_resp. They can fail for multiple reasons, including ConnectTimeout, MaxRetryError, ProxyError, etc. When this happens you can rerun the function with the same link (url), the same headers and a new proxy. The best workaround for these errors is to use a commercial proxy service.
In my latest test I was able to obtain a list of potentially functional proxies and extract all the content for all 25 pages related to your search. Below is the timeDelta for this test:
Content Extraction Runtime: 0:00:34.176803
Total Runtime: 0:01:22.429338
I can speed this up if I use threading with the function fetch_resp.
Below is the current code that I'm using. I need to improve the error handling, but it currently works.
import time
import random
import requests
from datetime import timedelta
from bs4 import BeautifulSoup
from proxy_checking import ProxyChecker
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from urllib3.exceptions import MaxRetryError, ProxySchemeUnknown
from concurrent.futures import ThreadPoolExecutor, as_completed

proxies_addresses = []
current_proxy = ''


def requests_retry_session(retries=5,
                           backoff_factor=0.5,
                           status_force_list=(500, 502, 503, 504),
                           session=None,
                           ):
    session = session or requests.Session()
    retry = Retry(
        total=retries,
        read=retries,
        connect=retries,
        backoff_factor=backoff_factor,
        status_forcelist=status_force_list,
    )
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    return session


def ssl_proxy_addresses():
    global proxies_addresses
    response = requests.get('https://www.sslproxies.org/')
    soup = BeautifulSoup(response.text, "lxml")
    proxies = []
    table = soup.find('tbody')
    table_rows = table.find_all('tr')
    for row in table_rows:
        ip_address = row.find_all('td')[0]
        port_number = row.find_all('td')[1]
        proxies.append(f'{ip_address.text}:{port_number.text}')
    proxies_addresses = proxies
    return proxies


def proxy_verification(current_proxy_address):
    checker = ProxyChecker()
    proxy_judge = checker.check_proxy(current_proxy_address)
    proxy_status = bool([value for key, value in proxy_judge.items() if key == 'status' and value is True])
    if proxy_status is True:
        return current_proxy_address
    else:
        return None


def get_proxy_address():
    global proxies_addresses
    proxy_addresses = ssl_proxy_addresses()
    processes = []
    with ThreadPoolExecutor(max_workers=40) as executor:
        for proxy_address in proxy_addresses:
            processes.append(executor.submit(proxy_verification, proxy_address))
    proxies = [task.result() for task in as_completed(processes) if task.result() is not None]
    proxies_addresses = proxies
    return proxies_addresses


def fetch_resp(link, http_headers, proxy_url):
    try:
        print(F'Current Proxy: {proxy_url}')
        response = requests_retry_session().get(link,
                                                headers=http_headers,
                                                allow_redirects=True,
                                                verify=True,
                                                proxies=proxy_url,
                                                timeout=(30, 45)
                                                )
        print("status code", response.status_code)
        if response.status_code == 200:
            return response
        else:
            current_proxy = proxies_addresses.pop(random.randrange(len(proxies_addresses)))
            fetch_resp(link, http_headers, current_proxy)
    except requests.exceptions.ConnectTimeout as e:
        print('Error,Timeout!')
        current_proxy = proxies_addresses.pop(random.randrange(len(proxies_addresses)))
        fetch_resp(link, http_headers, current_proxy)
    except requests.exceptions.ProxyError as e:
        print('ProxyError!')
        current_proxy = proxies_addresses.pop(random.randrange(len(proxies_addresses)))
        fetch_resp(link, http_headers, current_proxy)
    except requests.exceptions.ConnectionError as e:
        print('Connection Error')
        current_proxy = proxies_addresses.pop(random.randrange(len(proxies_addresses)))
        fetch_resp(link, http_headers, current_proxy)
    except requests.exceptions.HTTPError as e:
        print('HTTP ERROR!')
        current_proxy = proxies_addresses.pop(random.randrange(len(proxies_addresses)))
        fetch_resp(link, http_headers, current_proxy)
    except requests.exceptions.Timeout as e:
        print('Error! Connection Timeout!')
        current_proxy = proxies_addresses.pop(random.randrange(len(proxies_addresses)))
        fetch_resp(link, http_headers, current_proxy)
    except ProxySchemeUnknown as e:
        print('ERROR unknown Proxy Scheme!')
        current_proxy = proxies_addresses.pop(random.randrange(len(proxies_addresses)))
        fetch_resp(link, http_headers, current_proxy)
    except MaxRetryError as e:
        print('MaxRetryError')
        current_proxy = proxies_addresses.pop(random.randrange(len(proxies_addresses)))
        fetch_resp(link, http_headers, current_proxy)
    except requests.exceptions.TooManyRedirects as e:
        print('ERROR! Too many redirects!')
        current_proxy = proxies_addresses.pop(random.randrange(len(proxies_addresses)))
        fetch_resp(link, http_headers, current_proxy)


def get_content(http_headers, proxy_url):
    start_time = time.time()
    results = []
    pages = int(25)
    for page_number in range(1, pages):
        print(page_number)
        next_url = f"https://www.yellowpages.com/search?search_terms=pizza&geo_location_terms=Los%20Angeles%2C%20CA" \
                   f"&page={page_number}"
        res = fetch_resp(next_url, http_headers, proxy_url)
        soup = BeautifulSoup(res.text, "lxml")
        info_sections = soup.find_all('li', {'class': 'business-card'})
        for info_section in info_sections:
            shop_name = info_section.find('h2', {'class': 'title business-name'})
            categories = ', '.join([i.text for i in info_section.find_all('a', {'class': 'category'})])
            results.append({shop_name.text, categories})
    end_time = time.time() - start_time
    print(f'Content Extraction Runtime: {timedelta(seconds=end_time)}')
    return results


start_time = time.time()

get_proxy_address()
if len(proxies_addresses) != 0:
    print(proxies_addresses)
    print('\n')

    current_proxy = proxies_addresses.pop(random.randrange(len(proxies_addresses)))
    print(current_proxy)
    print('\n')

    base_url = 'https://www.yellowpages.com{}'
    current_url = 'https://www.yellowpages.com/search?search_terms=pizza&geo_location_terms=Los+Angeles%2C+CA'

    headers = {
        'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 13_3_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.5 Mobile/15E148 Safari/604.1',
    }

    PROXIES = {
        'https': f"http://{current_proxy}"
    }

    results = get_content(headers, PROXIES)

end_time = time.time() - start_time
print(f'Total Runtime: {timedelta(seconds=end_time)}')
UPDATE 07-06-2022 11:02 GMT
This seems to be your core question:
How can I quickly select a functional proxy from a list of proxies?
First, all my previous code is able to validate that a proxy is working at a given moment in time. Once validated I'm able to query and extract data from your Yellow Pages search for pizza in Los Angeles.
Using my previous method I'm able to query and extract data for all 24 pages related to your search in 0:00:45.367209 seconds.
Back to your question.
The website https://www.sslproxies.org provides a list of 100 HTTPS proxies. There is zero guarantee that all 100 are currently operational. One of the ways to identify the working ones is using a Proxy Judge service.
In my previous code I continually selected a random proxy from the list of 100 and passed it to a Proxy Judge for validation. Once a proxy is validated as working, it is used to query and extract data from Yellow Pages.
The method above works, but I was wondering how many proxies out of the 100 pass the sniff test for the Proxy Judge service. I attempted to check using a basic for loop, which was deathly slow, so I decided to use concurrent.futures to speed up the validation.
The code below takes about 1 minute to obtain a list of HTTPS proxies and validate them using a Proxy Judge service.
This is the fastest way to obtain a list of free proxies that are functional at a specific moment in time.
import requests
from bs4 import BeautifulSoup
from proxy_checking import ProxyChecker
from concurrent.futures import ThreadPoolExecutor, as_completed


def ssl_proxy_addresses():
    response = requests.get('https://www.sslproxies.org/')
    soup = BeautifulSoup(response.text, "lxml")
    proxies = []
    table = soup.find('tbody')
    table_rows = table.find_all('tr')
    for row in table_rows:
        ip_address = row.find_all('td')[0]
        port_number = row.find_all('td')[1]
        proxies.append(f'{ip_address.text}:{port_number.text}')
    return proxies


def proxy_verification(current_proxy_address):
    checker = ProxyChecker()
    proxy_judge = checker.check_proxy(current_proxy_address)
    proxy_status = bool([value for key, value in proxy_judge.items() if key == 'status' and value is True])
    if proxy_status is True:
        return current_proxy_address
    else:
        return None


def get_proxy_address():
    proxy_addresses = ssl_proxy_addresses()
    processes = []
    with ThreadPoolExecutor(max_workers=20) as executor:
        for proxy_address in proxy_addresses:
            processes.append(executor.submit(proxy_verification, proxy_address))
    proxies = [task.result() for task in as_completed(processes) if task.result() is not None]


print(len(proxies))
13

print(proxies)
['34.228.74.208:8080', '198.41.67.18:8080', '139.9.64.238:443', '216.238.72.163:59394', '64.189.24.250:3129', '62.193.108.133:1976', '210.212.227.68:3128', '47.241.165.133:443', '20.26.4.251:3128', '185.76.9.123:3128', '129.41.171.244:8000', '12.231.44.251:3128', '5.161.105.105:80']
UPDATE CODE 07-05-2022 17:07 GMT
I added a snippet of code below to query the second page. I did this to see if the proxy stayed the same, which it did. You still need to add some error handling.
In my testing I was able to query all 24 pages related to your search in 0:00:45.367209 seconds. I don't consider this query and extraction speed slow by any means.
Concerning performing a different search: I would use the same method as below, but I would request a new proxy for that search, because free proxies do have limitations, such as lifetime and performance degradation.
import random
import logging
import requests
import traceback
from time import sleep
from random import randint
from bs4 import BeautifulSoup
from proxy_checking import ProxyChecker
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from urllib3.exceptions import ProxySchemeUnknown
from http_request_randomizer.requests.proxy.ProxyObject import Protocol
from http_request_randomizer.requests.proxy.requestProxy import RequestProxy

current_proxy = ''


def requests_retry_session(retries=5,
                           backoff_factor=0.5,
                           status_force_list=(500, 502, 503, 504),
                           session=None,
                           ):
    session = session or requests.Session()
    retry = Retry(
        total=retries,
        read=retries,
        connect=retries,
        backoff_factor=backoff_factor,
        status_forcelist=status_force_list,
    )
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    return session


def random_ssl_proxy_address():
    try:
        # Obtain a list of HTTPS proxies
        # Suppress the console debugging output by setting the log level
        req_proxy = RequestProxy(log_level=logging.ERROR, protocol=Protocol.HTTPS)
        # Obtain a random single proxy from the list of proxy addresses
        random_proxy = random.sample(req_proxy.get_proxy_list(), 1)
        return random_proxy[0].get_address()
    except AttributeError as e:
        pass


def proxy_verification(current_proxy_address):
    checker = ProxyChecker()
    proxy_judge = checker.check_proxy(current_proxy_address)
    proxy_status = bool([value for key, value in proxy_judge.items() if key == 'status' and value is True])
    return proxy_status


def get_proxy_address():
    global current_proxy
    random_proxy_address = random_ssl_proxy_address()
    current_proxy = random_proxy_address
    proxy_status = proxy_verification(random_proxy_address)
    if proxy_status is True:
        return
    else:
        print('Looking for a valid proxy address.')
        # this sleep timer is helping with some timeout issues
        # that were happening when querying
        sleep(randint(5, 10))
        get_proxy_address()


def fetch_resp(link, http_headers, proxy_url):
    try:
        response = requests_retry_session().get(link,
                                                headers=http_headers,
                                                allow_redirects=True,
                                                verify=True,
                                                proxies=proxy_url,
                                                timeout=(30, 45)
                                                )
        print(F'Current Proxy: {proxy_url}')
        print("status code", response.status_code)
        return response
    except requests.exceptions.ConnectTimeout as e:
        print('Error,Timeout!')
        print(''.join(traceback.format_tb(e.__traceback__)))
    except requests.exceptions.ConnectionError as e:
        print('Connection Error')
        print(''.join(traceback.format_tb(e.__traceback__)))
    except requests.exceptions.HTTPError as e:
        print('HTTP ERROR!')
        print(''.join(traceback.format_tb(e.__traceback__)))
    except requests.exceptions.Timeout as e:
        print('Error! Connection Timeout!')
        print(''.join(traceback.format_tb(e.__traceback__)))
    except ProxySchemeUnknown as e:
        print('ERROR unknown Proxy Scheme!')
        print(''.join(traceback.format_tb(e.__traceback__)))
    except requests.exceptions.TooManyRedirects as e:
        print('ERROR! Too many redirects!')
        print(''.join(traceback.format_tb(e.__traceback__)))


def get_next_page(raw_soup, http_headers, proxy_urls):
    next_page_element = raw_soup.find('a', {'class': 'paginator-next arrow-next'})
    next_url = f"https://www.yellowpages.com{next_page_element['href']}"
    sub_response = fetch_resp(next_url, http_headers, proxy_urls)
    new_soup = BeautifulSoup(sub_response.text, "lxml")
    return new_soup


def get_content(link, http_headers, proxy_urls):
    res = fetch_resp(link, http_headers, proxy_urls)
    soup = BeautifulSoup(res.text, "lxml")
    info_sections = soup.find_all('li', {'class': 'business-card'})
    for info_section in info_sections:
        shop_name = info_section.find('h2', {'class': 'title business-name'})
        print(shop_name.text)
        categories = ', '.join([i.text for i in info_section.find_all('a', {'class': 'category'})])
        print(categories)
        business_website = info_section.find('a', {'class': 'website listing-cta action'})
        if business_website is not None:
            print(business_website['href'])
        elif business_website is None:
            print('no website')

    # get page 2
    if soup.find('a', {'class': 'paginator-next arrow-next'}) is not None:
        soup_next_page = get_next_page(soup, http_headers, proxy_urls)
        info_sections = soup_next_page.find_all('li', {'class': 'business-card'})
        for info_section in info_sections:
            shop_name = info_section.find('h2', {'class': 'title business-name'})
            print(shop_name.text)
            categories = ', '.join([i.text for i in info_section.find_all('a', {'class': 'category'})])
            print(categories)
            business_website = info_section.find('a', {'class': 'website listing-cta action'})
            if business_website is not None:
                print(business_website['href'])
            elif business_website is None:
                print('no website')


get_proxy_address()
if len(current_proxy) != 0:
    print(current_proxy)

    base_url = 'https://www.yellowpages.com{}'
    current_url = 'https://www.yellowpages.com/search?search_terms=pizza&geo_location_terms=Los+Angeles%2C+CA'

    headers = {
        'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 13_3_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.5 Mobile/15E148 Safari/604.1',
    }

    PROXIES = {
        'https': f"http://{current_proxy}"
    }

    get_content(current_url, headers, PROXIES)
truncated output
Current Proxy: {'https': 'http://157.185.161.123:59394'}
status code 200
1.Casa Bianca Pizza Pie
2.Palermo Italian Restaurant
... truncated
Current Proxy: {'https': 'http://157.185.161.123:59394'}
status code 200
31.Johnnie's New York Pizzeria
32.Amalfi Restaurant and Bar
... truncated
UPDATE CODE 07-05-2022 14:07 GMT
I reworked my code posted on 07-01-2022 to output these data elements: business name, business categories, and business website.
1.Casa Bianca Pizza Pie
Pizza, Italian Restaurants, Restaurants
http://www.casabiancapizza.com
2.Palermo Italian Restaurant
Pizza, Restaurants, Italian Restaurants
no website
... truncated
UPDATE CODE 07-01-2022
I noted that when using the free proxies errors were being thrown. I added the requests_retry_session function to handle this. I didn't rework all your code, but I did make sure that I could query the site and produce results using a free proxy. You should be able to work my code into yours.
import random
import logging
import requests
from time import sleep
from random import randint
from bs4 import BeautifulSoup
from proxy_checking import ProxyChecker
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from http_request_randomizer.requests.proxy.ProxyObject import Protocol
from http_request_randomizer.requests.proxy.requestProxy import RequestProxy

current_proxy = ''


def requests_retry_session(retries=5,
                           backoff_factor=0.5,
                           status_force_list=(500, 502, 504),
                           session=None,
                           ):
    session = session or requests.Session()
    retry = Retry(
        total=retries,
        read=retries,
        connect=retries,
        backoff_factor=backoff_factor,
        status_forcelist=status_force_list,
    )
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    return session


def random_ssl_proxy_address():
    try:
        # Obtain a list of HTTPS proxies
        # Suppress the console debugging output by setting the log level
        req_proxy = RequestProxy(log_level=logging.ERROR, protocol=Protocol.HTTPS)
        # Obtain a random single proxy from the list of proxy addresses
        random_proxy = random.sample(req_proxy.get_proxy_list(), 1)
        return random_proxy[0].get_address()
    except AttributeError as e:
        pass


def proxy_verification(current_proxy_address):
    checker = ProxyChecker()
    proxy_judge = checker.check_proxy(current_proxy_address)
    proxy_status = bool([value for key, value in proxy_judge.items() if key == 'status' and value is True])
    return proxy_status


def get_proxy_address():
    global current_proxy
    random_proxy_address = random_ssl_proxy_address()
    current_proxy = random_proxy_address
    proxy_status = proxy_verification(random_proxy_address)
    if proxy_status is True:
        return
    else:
        print('Looking for a valid proxy address.')
        # this sleep timer is helping with some timeout issues
        # that were happening when querying
        sleep(randint(5, 10))
        get_proxy_address()


def fetch_resp(link, http_headers, proxy_url):
    response = requests_retry_session().get(link,
                                            headers=http_headers,
                                            allow_redirects=True,
                                            verify=True,
                                            proxies=proxy_url,
                                            timeout=(30, 45)
                                            )
    print("status code", response.status_code)
    return response


def get_content(link, headers, proxy_urls):
    res = fetch_resp(link, headers, proxy_urls)
    soup = BeautifulSoup(res.text, "lxml")
    info_sections = soup.find_all('li', {'class': 'business-card'})
    for info_section in info_sections:
        shop_name = info_section.find('h2', {'class': 'title business-name'})
        print(shop_name.text)
        categories = ', '.join([i.text for i in info_section.find_all('a', {'class': 'category'})])
        print(categories)
        business_website = info_section.find('a', {'class': 'website listing-cta action'})
        if business_website is not None:
            print(business_website['href'])
        elif business_website is None:
            print('no website')


get_proxy_address()
if len(current_proxy) != 0:
    print(current_proxy)

    base_url = 'https://www.yellowpages.com{}'
    current_url = 'https://www.yellowpages.com/search?search_terms=pizza&geo_location_terms=Los+Angeles%2C+CA'

    headers = {
        'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 13_3_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.5 Mobile/15E148 Safari/604.1',
    }

    PROXIES = {
        'https': f"http://{current_proxy}"
    }

    get_content(current_url, headers, PROXIES)
PREVIOUS ANSWERS
06-30-2022:
During some testing I found a bug, so I updated my code to handle the bug.
06-28-2022:
You could use a proxy judge, which is used for testing the performance and the anonymity status of a proxy server.
The code below is from one of my previous answers.
import random
import logging
from time import sleep
from random import randint
from proxy_checking import ProxyChecker
from http_request_randomizer.requests.proxy.ProxyObject import Protocol
from http_request_randomizer.requests.proxy.requestProxy import RequestProxy

current_proxy = ''


def random_ssl_proxy_address():
    try:
        # Obtain a list of HTTPS proxies
        # Suppress the console debugging output by setting the log level
        req_proxy = RequestProxy(log_level=logging.ERROR, protocol=Protocol.HTTPS)
        # Obtain a random single proxy from the list of proxy addresses
        random_proxy = random.sample(req_proxy.get_proxy_list(), 1)
        return random_proxy[0].get_address()
    except AttributeError as e:
        pass


def proxy_verification(current_proxy_address):
    checker = ProxyChecker()
    proxy_judge = checker.check_proxy(current_proxy_address)
    proxy_status = bool([value for key, value in proxy_judge.items() if key == 'status' and value is True])
    return proxy_status


def get_proxy_address():
    global current_proxy
    random_proxy_address = random_ssl_proxy_address()
    current_proxy = random_proxy_address
    proxy_status = proxy_verification(random_proxy_address)
    if proxy_status is True:
        return
    else:
        print('Looking for a valid proxy address.')
        # this sleep timer is helping with some timeout issues
        # that were happening when querying
        sleep(randint(5, 10))
        get_proxy_address()


get_proxy_address()
if len(current_proxy) != 0:
    print(f'Valid proxy address: {current_proxy}')

# output
Valid proxy address: 157.100.12.138:999
I noted today that the Python package HTTP_Request_Randomizer has a couple of Beautiful Soup selector problems that need to be fixed, because they currently don't work in version 1.3.2 of HTTP_Request_Randomizer.
You need to modify line 27 in FreeProxyParser.py to this:
table = soup.find("table", attrs={"class": "table table-striped table-bordered"})
You need to modify line 27 in SslProxyParser.py to this:
table = soup.find("table", attrs={"class": "table table-striped table-bordered"})
I found another bug that needs to be fixed. This one is in proxy_checking.py; I had to add the line if url != None:
def get_info(self, url=None, proxy=None):
    info = {}
    proxy_type = []
    judges = ['http://proxyjudge.us/azenv.php', 'http://azenv.net/', 'http://httpheader.net/azenv.php', 'http://mojeip.net.pl/asdfa/azenv.php']
    if url != None:
        try:
            response = requests.get(url, headers=headers, timeout=5)
            return response
        except:
            pass
    elif proxy != None:
I want to scrape the title and the URL of each post in the forum at the URL below, so that when a new post is created with one of the titles listed, I receive a mail with the link to that post.
Please don't be too harsh with me; I'm a beginner with Python and scraping.
I have multiple problems.
1: In the while(True) loop, soup is underlined in red with the error: Undefined variable 'soup'.
2: When I comment out the while(True) loop, the program does not run. I get no error.
3: When there is a new post matching one of my criteria, how do I get the URL of that post?
Titles
def Jeti_DC_16
def Jeti_DC_16_v2
def Jeti_DS_16
def Jeti_DS16_v2
My full code:
from requests import get
from bs4 import BeautifulSoup
import re
import smtplib
import time
import lxml
import pprint
import json

URL = 'https://www.rc-network.de/forums/biete-rc-elektronik-zubeh%C3%B6r.135/'


def scrape_page_metadata(URL):
    headers = {
        "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36'}
    pp = pprint.PrettyPrinter(indent=4)
    response = get(URL, headers=headers)
    soup = BeautifulSoup(response.content, "lxml")

    metadata = {
        'Jeti_DC_16': Jeti_DC_16(soup, URL),
        'jeti_dc_16_2': Jeti_DC_16_v2(soup, URL),
        'jeti_ds_16': Jeti_DS_16(soup, URL),
        'jeti_ds_16_2': Jeti_DS_16_v2(soup, URL)
    }
    pp.pprint(metadata)
    return metadata


def Jeti_DC_16(soup, URL):
    jeti_dc_16 = None
    if soup.name.string:
        jeti_dc_16 = soup.title.string
    elif soup.find_all("div", class_='structItem-title'):
        jeti_dc_16 = soup.find_all(
            "div", class_='structItem-title').get('text')
    else:
        jeti_dc_16 = URL.split('//')[1]
        return jeti_dc_16.split('/')[0].rsplit('.')[1].capitalize()
    return jeti_dc_16


def Jeti_DC_16_v2(soup, URL):
    jeti_dc_16_v2 = None
    if soup.name.string:
        jeti_dc_16_v2 = soup.title.string
    elif soup.find_all("div", class_='structItem-title'):
        jeti_dc_16_v2 = soup.find_all(
            "div", class_='structItem-title').get('text')
    else:
        jeti_dc_16_v2 = URL.split('//')[1]
        return jeti_dc_16_v2.split('/')[0].rsplit('.')[1].capitalize()
    return jeti_dc_16_v2


def Jeti_DS_16(soup, URL):
    jeti_ds_16 = None
    if soup.jeti_ds_16.string:
        jeti_ds_16 = soup.jeti_ds_16.string
    elif soup.find_all("div", class_='structItem-title'):
        jeti_ds_16 = soup.find_all(
            "div", class_='structItem-title').get('text')
    else:
        jeti_ds_16 = URL.split('//')[1]
        return jeti_ds_16.split('/')[0].rsplit('.')[1].capitalize()
    return jeti_ds_16


def Jeti_DS_16_v2(soup, URL):
    jeti_ds_16_v2 = None
    if soup.name.string:
        jeti_ds_16_v2 = soup.title.string
    elif soup.find_all("div", class_='structItem-title'):
        jeti_ds_16_v2 = soup.find_all(
            "div", class_='structItem-title').get('text')
    else:
        jeti_dc_16_v2 = URL.split('//')[1]
        return jeti_dc_16_v2.split('/')[0].rsplit('.')[1].capitalize()
    return jeti_ds_16_v2

    # search_for_class = soup.find_all(
    #     'div', class_='structItem-title')
    # Jeti_DS_16 = soup.find_all(text="Jeti DS 16")
    # Jeti_DS_16_v2 = soup.find_all(text="Jeti DS 16 2")
    # Jeti_DC_16 = soup.find_all(text="Jeti DC 16")
    # Jeti_DC_16_v2 = soup.find_all(text="Jeti DC 16 2")


if(Jeti_DC_16, Jeti_DC_16_v2, Jeti_DS_16, Jeti_DS_16_v2):
    send_mail()
    # # print('Die Nummer {0} {1} {2} {3} wurden gezogen'.format(
    # #     Jeti_DC_16, Jeti_DC_16_v2, Jeti_DS_16, Jeti_DS_16_v2))

    # for i in soup.find_all('div', attrs={'class': 'structItem-title'}):
    #     print(i.a['href'])

    # first_result = search_for_class[2]
    # print(first_result.text)
    # print(Jeti_DC_16, Jeti_DC_16_v2, Jeti_DS_16, Jeti_DS_16_v2)


def send_mail():
    with open('/Users/blackbox/Desktop/SynologyDrive/Programmieren/rc-network/credentials.json', 'r') as myFile:
        data = myFile.read()
        obj = json.loads(data)
        print("test: " + str(obj['passwd']))

    server_ssl = smtplib.SMTP_SSL('smtp.gmail.com', 465)
    server_ssl.ehlo()
    # server.starttls()
    # server.ehlo()
    server_ssl.login('secure#gmail.com', 'secure')
    subject = 'Es gibt ein neuer Post im RC-Network auf deine gespeicherte Anfragen. Sieh in dir an{Link to Post}'
    body = 'Sieh es dir an Link: https://www.rc-network.de/forums/biete-rc-elektronik-zubeh%C3%B6r.135/'
    msg = f"Subject: {subject}\n\n{body}"
    emails = ["secure#gmx.de"]
    server_ssl.sendmail(
        'secure#gmail.com',
        emails,
        msg
    )
    print('e-Mail wurde versendet!')
    # server_ssl.quit


while(True):
    Jeti_DC_16(soup, URL)
    Jeti_DC_16_v2(soup, URL)
    Jeti_DS_16(soup, URL)
    Jeti_DS_16_v2(soup, URL)
    time.sleep(10)
    # time.sleep(86400)
You create soup inside scrape_page_metadata, so it is a local variable that doesn't exist outside scrape_page_metadata. In the while loop you should call scrape_page_metadata() instead of the functions Jeti_DC_16(), Jeti_DC_16_v2(), Jeti_DS_16(), Jeti_DS_16_v2().
That function gives you metadata, which you should check instead of if(Jeti_DC_16, Jeti_DC_16_v2, Jeti_DS_16, Jeti_DS_16_v2).
More or less (you have to use the correct values in place of ... because I don't know what you want to compare):
while True:
    metadata = scrape_page_metadata(URL)
    if metadata["Jeti_DC_16"] == ... and metadata["Jeti_DC_16_v2"] == ... and metadata["Jeti_DS_16"] == ... and metadata["Jeti_DS_16_v2"] == ...:
        send_mail()
    time.sleep(10)
But there are other problems.
All your functions Jeti_DC_16, Jeti_DC_16_v2, Jeti_DS_16, Jeti_DS_16_v2 look the same and probably return the same element. You could keep one of them and delete the others, or change them so that each searches for a different element.
You will probably have to use more print() calls to see the values in variables and which parts of the code are executed, because I think this code still needs a lot of changes.
For example, find_all() gives a list of results, so you can't call get() on it, which needs a single element. You need a for loop to get all titles from all elements.
More or less:
jeti_ds_16_v2 = soup.find_all("div", class_='structItem-title')
jeti_ds_16_v2 = [item.get_text(strip=True) for item in jeti_ds_16_v2]
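For your third question (getting the URL of a matching post), the commented-out loop in your own code is already close. Here is a small sketch along those lines, assuming soup comes from scrape_page_metadata and the forum keeps wrapping each thread link in a div.structItem-title element; the wanted list is just an illustration of your titles:

wanted = ['Jeti DC 16', 'Jeti DC 16 2', 'Jeti DS 16', 'Jeti DS 16 2']

for item in soup.find_all('div', class_='structItem-title'):
    title = item.get_text(strip=True)
    link = item.a['href']  # relative link of the post
    if any(word.lower() in title.lower() for word in wanted):
        full_url = 'https://www.rc-network.de' + link
        print(title, full_url)
        # full_url could be passed to send_mail() here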
I'm trying to upload an image captcha to the 2captcha API, but I'm receiving the error ERROR_ZERO_CAPTCHA_FILESIZE, even though the file in my directory is above 60 KB. Why am I still receiving this error?
Am I missing something?
I've tried sending only the captcha image (originally downloaded from the source), but I received the same error or TOO_MANY_BAD_IMAGES. Please help me.
CODE:
from selenium import webdriver
from time import sleep  # needed for the polling loop below
import requests

driverop = webdriver.ChromeOptions()
driverop.add_argument("--start-maximized")
proxy = "118.174.233.45:44061"
driverop = webdriver.ChromeOptions()
driverop.add_argument('--proxy-server=%s' % proxy)
driver = webdriver.Chrome("chromedriver/chromedriver", options=driverop)
driver.get("https://accounts.google.com/o/oauth2/auth/oauthchooseaccount?client_id=717762328687-iludtf96g1hinl76e4lc1b9a82g457nn.apps.googleusercontent.com&scope=profile%20email&redirect_uri=https%3A%2F%2Fstackauth.com%2Fauth%2Foauth2%2Fgoogle&state=%7B%22sid%22%3A1%2C%22st%22%3A%2259%3A3%3Abbc%2C16%3Af9ef16faad8743e2%2C10%3A1609613474%2C16%3Aa2c13dc2511eb0d0%2Ccb47135ca2a3bc9ca4ee712429ddf5c0935588f518c964242057bb74b818d4de%22%2C%22cdl%22%3Anull%2C%22cid%22%3A%22717762328687-iludtf96g1hinl76e4lc1b9a82g457nn.apps.googleusercontent.com%22%2C%22k%22%3A%22Google%22%2C%22ses%22%3A%22e15efb6754f4498991cd1d37a967f325%22%7D&response_type=code&flowName=GeneralOAuthFlow")
driver.find_element_by_id("identifierId").send_keys(EMAIL)
driver.find_element_by_css_selector('.VfPpkd-LgbsSe-OWXEXe-k8QpJ > div:nth-child(3)').click()
driver.save_screenshot("sample.png")

url = 'http://2captcha.com/in.php'
API_KEY = "---"
files = {'file': open('sample.png', "rb")}
data = {'key': API_KEY, 'method': 'post'}
r = requests.post(url, files=files, data=data)
if r.ok:
    print(r)
    url = "http://2captcha.com/in.php?key=" + API_KEY + "&action=get&id=" + r.text[3:]
    for xr in range(1, 10):
        sleep(1.5)  # wait before polling again
        resp = requests.get(url)
        if resp.text[0:2] == 'OK':
            break
        else:
            print(resp)
If you use driver.save_screenshot, you save a screenshot of the current window, according to the documentation (WebDriver.save_screenshot). Try keeping a reference to the element and using the webelement.screenshot method instead (WebElement.screenshot):
from selenium import webdriver
from time import sleep
import requests

driverop = webdriver.ChromeOptions()
driverop.add_argument("--start-maximized")
proxy = "118.174.233.45:44061"
driverop = webdriver.ChromeOptions()
driverop.add_argument('--proxy-server=%s' % proxy)
driver = webdriver.Chrome("chromedriver/chromedriver", options=driverop)
driver.get("https://accounts.google.com/o/oauth2/auth/oauthchooseaccount?client_id=717762328687-iludtf96g1hinl76e4lc1b9a82g457nn.apps.googleusercontent.com&scope=profile%20email&redirect_uri=https%3A%2F%2Fstackauth.com%2Fauth%2Foauth2%2Fgoogle&state=%7B%22sid%22%3A1%2C%22st%22%3A%2259%3A3%3Abbc%2C16%3Af9ef16faad8743e2%2C10%3A1609613474%2C16%3Aa2c13dc2511eb0d0%2Ccb47135ca2a3bc9ca4ee712429ddf5c0935588f518c964242057bb74b818d4de%22%2C%22cdl%22%3Anull%2C%22cid%22%3A%22717762328687-iludtf96g1hinl76e4lc1b9a82g457nn.apps.googleusercontent.com%22%2C%22k%22%3A%22Google%22%2C%22ses%22%3A%22e15efb6754f4498991cd1d37a967f325%22%7D&response_type=code&flowName=GeneralOAuthFlow")
driver.find_element_by_id("identifierId").send_keys(EMAIL)
# keep the WebElement so it can be screenshotted (click() itself returns None)
element = driver.find_element_by_css_selector('.VfPpkd-LgbsSe-OWXEXe-k8QpJ > div:nth-child(3)')
element.click()
element.screenshot("sample.png")

url = 'http://2captcha.com/in.php'
API_KEY = "---"
files = {'file': open('sample.png', "rb")}
data = {'key': API_KEY, 'method': 'post'}
r = requests.post(url, files=files, data=data)
if r.ok:
    print(r)
    url = "http://2captcha.com/in.php?key=" + API_KEY + "&action=get&id=" + r.text[3:]
    for xr in range(1, 10):
        sleep(1.5)
        resp = requests.get(url)
        if resp.text[0:2] == 'OK':
            break
        else:
            print(resp)
Regarding your comment, I think your problem is with how you are calling the 2captcha API.
If so, instead of using the requests module, try their official Python client, TwoCaptcha.
Install it by: pip3 install 2captcha-python
Here is a snippet you can try for uploading your sample:
import sys
import os

sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))

from twocaptcha import TwoCaptcha

api_key = os.getenv('APIKEY_2CAPTCHA', 'YOUR_API_KEY')
solver = TwoCaptcha(api_key)

try:
    result = solver.normal('path/to/captcha.jpg')
except Exception as e:
    sys.exit(e)
else:
    sys.exit('solved: ' + str(result))
There's no need to use Selenium and many lines of code to solve the captcha. Just use the short code below; it will solve it and provide the response token.
Note: it will handle image-selection challenges as well (car, bus, ship, truck and so on).
-- sitekey: inspect the element and find the data-sitekey attribute.
-- url: your website URL.
from twocaptcha import TwoCaptcha

twoCaptcha = TwoCaptcha('xxxxxxxxxxxxxxx')  # Your 2captcha API key
captcha_token = twoCaptcha.recaptcha(sitekey='xxxxxxxxxxxxxx',
                                     url='website url')
print(captcha_token)
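What you do with that token afterwards depends on the site. A common pattern, shown here only as a sketch, is to inject it into the standard hidden g-recaptcha-response textarea with Selenium and then trigger the site's own submit action; driver is assumed to be an existing WebDriver session on the page that served the captcha.

# captcha_token comes from twoCaptcha.recaptcha(...) above; depending on the
# client version it may be a plain string or a dict with a 'code' key.
token = captcha_token['code'] if isinstance(captcha_token, dict) else captcha_token

# Write the token into the hidden textarea that reCAPTCHA reads its response from.
driver.execute_script(
    "document.getElementById('g-recaptcha-response').innerHTML = arguments[0];",
    token,
)
# Then click the page's own submit button (the selector is site-specific).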
I'm starting to work with Python again after 8 years. I'm trying to write a program with BeautifulSoup that takes a collection as an argument. I pass the argument medios to the function count_words as the url, but it doesn't work. Is there a way to fix it, or to search for a word on multiple websites using BeautifulSoup?
import requests
from bs4 import BeautifulSoup


def count_words(url, the_word):
    r = requests.get(url, allow_redirects=False)
    soup = BeautifulSoup(r.content, 'lxml')
    words = soup.find(text=lambda text: text and the_word in text)
    # print(words)
    return len(words)


def main():
    url = 'https://www.nytimes.com/'
    medios = {
        'Los Angeles Times': ['http://www.latimes.com/'],
        'New York Times': ['http://www.nytimes.com/']
    }
    word = 'Trump'
    # count = count_words(url, word)
    cuenta = count_words(medios, word)
    # print('\n El Sitio: {}\n Contiene {} occurrencias de la palabra: {}'.format(url, count, word))
    print('\n La palabra: {} aparece {} occurrencias en el New York Times'.format(word, cuenta))


if __name__ == '__main__':
    main()
There are 3 problems here
medios is a dict, so you have to loop through its keys and values and send each URL to the method, since the method only accepts a URL string.
BeautifulSoup's find method needs a tag name to search for, otherwise it returns None. If you want to count the occurrences of the word, use count on the response text.
You have to send a User-Agent header in the requests call, otherwise you will get a 403 or 301.
import requests
from bs4 import BeautifulSoup

headers = {'user-agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36"}


def count_words(url, the_word):
    r = requests.get(url, headers=headers)
    return r.text.lower().count(the_word)


def main():
    url = 'https://www.nytimes.com/'
    medios = {
        'Los Angeles Times': ['http://www.latimes.com/'],
        'New York Times': ['http://www.nytimes.com/']
    }
    word = 'trump'
    for web_name, urls in medios.items():
        for url in urls:
            cuenta = count_words(url, word)
            print('La palabra: {} aparece {} occurrencias en el {}'.format(word, cuenta, web_name))


if __name__ == '__main__':
    main()
Output:
La palabra: trump aparece 47 occurrencias en el Los Angeles Times
La palabra: trump aparece 194 occurrencias en el New York Times
You are sending a dictionary to count_words(). You need to send the URLs in a loop, or else loop through the dictionary inside count_words().
Perhaps you meant:
cuenta = count_words(url, word)
Update your code to the following:
cuenta = 0
for key in medios:
    for url in medios[key]:
        cuenta += count_words(url, word)
Basically, you should pass a URL, not a dict, and I am assuming you want to count the word across all the entries in medios.