discord.app_commands.errors.CommandInvokeError: Command 'sadcommand' raised an exception: TypeError: 'CloudScraper' object is not callable
def paste_clicked_spots(self, auth):
    bunda = cs.create_scraper()("https://rest-bf.blox.land/games/mines/history",
        headers={"x-auth-token": auth, "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"},
        params={'size': '50', 'page': '0'})
    if bunda.status_code == 201:
        return bunda.json()['data']
    else:
        return False

def paste_game(self, auth):
    bunda = cs.create_scraper()("https://rest-bf.blox.land/games/mines/history",
        headers={"x-auth-token": auth, "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"},
        params={'size': '1', 'page': '0'})
    if bunda.status_code == 201:
        return bunda.json()['data'][0]['uncoveredLocations']
    else:
        return False
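The TypeError comes from calling the CloudScraper object itself: cs.create_scraper() returns a scraper (a requests.Session subclass), and calling it like a function is what raises "'CloudScraper' object is not callable". A minimal sketch of the fix, assuming cs is the imported cloudscraper module and keeping the rest of the request unchanged, is to call the scraper's .get() method instead:

import cloudscraper as cs

def paste_clicked_spots(self, auth):
    # use .get() instead of calling the scraper object directly
    bunda = cs.create_scraper().get(
        "https://rest-bf.blox.land/games/mines/history",
        headers={"x-auth-token": auth,
                 "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"},
        params={'size': '50', 'page': '0'},
    )
    if bunda.status_code == 201:  # status-code check kept from the original code
        return bunda.json()['data']
    return False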
I am trying to retrieve the HTML response of a website that redirects with a 302. It seems that Python 3 is not following the temporary redirect and returns the original page instead.
Is there a way to configure the request so that it follows the 302?
import random
import requests
from pprint import pprint
from bs4 import BeautifulSoup

url = 'https://www.zurrose.de/catalogsearch/result?q=11343654'

headers = {
    'Access-Control-Allow-Origin': '*',
    'Access-Control-Allow-Methods': 'GET',
    'Access-Control-Allow-Headers': 'Content-Type',
    'Access-Control-Max-Age': '3600',
    'Referer': 'https://www.zurrose.de',
    'Connection': 'keep-alive'
}

user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:100.0) Gecko/20100101 Firefox/100.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.64 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.67 Safari/537.36'
]

headers['User-Agent'] = random.choice(user_agents)

r = requests.post(url, headers=headers)

print('start')
print(r.history[0].status_code)

if r.history:
    print("Request was redirected")
    print(r.history)
    for resp in r.history:
        print(resp.status_code, resp.url)
    print("Final destination:")
    print(r.status_code, r.url)

if not r.history:
    print(f'No redirect on {url}. Status {r.status_code}. PP URL not found for {sku}')
elif r.history[0].status_code < 300:
    print(f'No PP URL retrieved for {sku} on {url}. Status {r.history[0].status_code}')

soup = BeautifulSoup(r.content, 'html.parser')
for i in soup.select('link[rel*=canonical]'):
    # print(i['href'])
    url_pp = i['href']
    print(url_pp)

pprint(r.content)
The method you are using will not get you to the correct page: the redirect happens via cookies, so you need to send the request within a session. I'll show two ways to get the page you need:
import requests
from bs4 import BeautifulSoup


def show_product_ajax(sku):
    url = f"https://www.zurrose.de/search/ajax/suggest/?q={sku}"
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36'
    }
    for product in requests.request("GET", url, headers=headers).json():
        print(product['title'], product['url'],
              BeautifulSoup(product['price'], features='lxml').find('span', class_='price').getText())


def show_product_session(sku):
    url = f'https://www.zurrose.de/catalogsearch/result?q={sku}'
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36'
    }
    s = requests.Session()
    r = s.get(url, headers=headers)
    soup = BeautifulSoup(r.text, features='lxml')
    product_url = soup.find('meta', attrs={'itemprop': 'url'}).get('content')
    product_title = soup.find('span', attrs={'itemprop': 'name', 'data-ui-id': 'page-title-wrapper'}).getText()
    product_price = soup.find('span', class_='price').getText()
    print(product_title, product_url, product_price)


show_product_ajax(11343654)
show_product_session(11343654)
Result:
SYNGUT Synbiotikum m.Probiotika u.Prebiot.Beutel https://www.zurrose.de/produkte/syngut-synbiotikum-m-probiotika-u-prebiot-beutel-344764 23,50 €
SYNGUT Synbiotikum m.Probiotika u.Prebiot.Beutel-15 St https://www.zurrose.de/produkte/syngut-synbiotikum-m-probiotika-u-prebiot-beutel-344764?variation=344764_6502482 23,50 €
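As a quick sanity check (not part of the original answer), you can verify inside show_product_session that the session actually followed the cookie-based redirect by inspecting the response after r = s.get(url, headers=headers):

print(r.history)  # the redirect chain; non-empty if the site redirected
print(r.url)      # the final URL the session landed on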
I tried to add proxies, but it doesn't work. When I print the JSON response, the result is always my own IP. Here is the code:
proxy = open("proxies.txt", "r").readlines()

for i in proxy:
    i = i.replace("\n", "")
    proxies = {'http': i}
    data = {
        'login': '***********',
        'password': '*********',
    }
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.41 Safari/537.36'}
    r = s.post("https://example.com", headers=headers, proxies=proxies, data=data)
    print(r.json())
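One likely cause (an assumption, since the real target URL is redacted here): requests chooses a proxy by the URL scheme, and the proxies dict above only maps 'http'. An https:// request is therefore sent directly, which is why the echoed result is always your own IP. A minimal sketch mapping both schemes, reusing the headers and data from above and assuming proxies.txt holds host:port entries:

import requests

s = requests.Session()
with open("proxies.txt") as f:
    for line in f:
        proxy = line.strip()
        # map both schemes so HTTPS requests also go through the proxy
        proxies = {'http': f'http://{proxy}', 'https': f'http://{proxy}'}
        r = s.post("https://example.com", headers=headers, proxies=proxies, data=data)
        print(r.json())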
To send a message to Telegram, I use this template:
import requests

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"}

urlphoto = 'http://127.0.0.1:0001/Home/Site%20de%20Trabalho%20-%20Home.html'
botalert = 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
chatalert = 'yyyyyyyyyyyyyyyy'

urlalert = "https://api.telegram.org/bot" + botalert + "/sendMessage?text=" + urlphoto + "&chat_id=" + chatalert + "&parse_mode=HTML"

requests.get(urlalert, headers=headers)
But when the message is sent, the link does not arrive intact, because the %20 is converted into spaces.
How should I proceed so that the link is delivered exactly like this:
http://127.0.0.1:0001/Home/Site%20de%20Trabalho%20-%20Home.html
Use a parameters dictionary, and the parameters will be encoded correctly for you:
import requests

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"}

urlphoto = 'http://127.0.0.1:0001/Home/Site%20de%20Trabalho%20-%20Home.html'
botalert = 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
chatalert = 'yyyyyyyyyyyyyyyy'

urlalert = f'https://api.telegram.org/bot{botalert}/sendMessage'
params = {'text': urlphoto, 'chat_id': chatalert, 'parse_mode': 'HTML'}

requests.get(urlalert, headers=headers, params=params)
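This works because requests URL-encodes each parameter value, so the percent sign in %20 is sent as %25 and Telegram decodes it back to a literal %20. A quick way to inspect the encoded URL (a sketch, not part of the original answer):

from requests import Request

prepared = Request('GET', urlalert, params=params).prepare()
print(prepared.url)
# ...sendMessage?text=http%3A%2F%2F127.0.0.1%3A0001%2FHome%2FSite%2520de%2520Trabalho%2520-%2520Home.html&chat_id=...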
You can define urlphoto like this:
urlphoto = 'http://127.0.0.1:0001/Home/Site%20de%20Trabalho%20-%20Home.html'.replace('%20', '%2520')
This double-encodes the spaces, so after Telegram decodes the URL once, the delivered text still shows the percent sign followed by 20.
Try this:
import requests
from requests.utils import quote

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"}

urlphoto = 'http://127.0.0.1:0001/Home/Site%20de%20Trabalho%20-%20Home.html'
botalert = 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
chatalert = 'yyyyyyyyyyyyyyyy'

urlalert = f"https://api.telegram.org/bot{botalert}/sendMessage"

requests.get(urlalert, params=quote(f"?text={urlphoto}&chat_id={chatalert}&parse_mode=HTML"), headers=headers)
I am scraping a number of websites for job postings using Scrapy. If a page on a site fits my requirements, I store a link to the page in a database. No issue there. I've also created a script that goes through each link in the database and pings the URL. If it returns a 404, it gets deleted. The issue I'm having is that some sites are returning 403 errors when I do the deletion check. What's weird is that they all allow scraping, but they are blocking the check. This is the script I'm using to do the deletion check:
from pymongo import MongoClient
import requests
import urllib3
from operator import itemgetter
import random
import time

client = MongoClient("path-to-mongo")
db = client["mongoDB"]
col = db['mongoCollection']

openings = list(col.find())
sorted_openings = sorted(openings, key=itemgetter('Company'))

del_counter = 0

user_agents = ["Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.1.4322)",
               "Mozilla/5.0 CK={} (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko",
               "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
               "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36",
               "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36",
               "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)",
               "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)",
               "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/605.1.15 (KHTML, like Gecko)",
               "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko)",
               "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/605.1.15 (KHTML, like Gecko)",
               "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1 Safari/605.1.15",
               "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/603.3.8 (KHTML, like Gecko)"]

headers = {"User-Agent": user_agents[random.randint(0, 11)]}

counter = 0
del_counter = 0
passed_counter = 0
deleted_links = []
passed_links = []
forbidden = []

for item in sorted_openings:
    try:
        if requests.get(item['Link'], allow_redirects=False, verify=False, headers=headers).status_code == 200:
            print(str(requests.get(item['Link'])) + ' ' + item['Link'])
            counter += 1
            print(counter)
        elif requests.get(item['Link'], allow_redirects=False, verify=False, headers=headers).status_code == 304:
            print(requests.get(item['Link']))
            counter += 1
            print(counter)
        elif requests.get(item['Link'], allow_redirects=False, verify=False, headers=headers).status_code == 403:
            forbidden.append(item['Link'])
            print(requests.get(item['Link']))
            counter += 1
            print(counter)
        else:
            db.openings.remove(item)
            deleted_links.append(item['Link'])
            del_counter += 1
            counter += 1
            print('Deleted ' + item['Link'])
            print(counter)
    except:
        pass
        passed_links.append(item['Link'])
        passed_counter += 1
        counter += 1
        print('Passed link ' + item['Link'])
        print(counter)
You are sending a separate request in each condition. Send one request, store the response in a variable, and then check its status code:
from pymongo import MongoClient
import requests
import urllib3
from operator import itemgetter
import random
import time

client = MongoClient("path-to-mongo")
db = client["mongoDB"]
col = db['mongoCollection']

openings = list(col.find())
sorted_openings = sorted(openings, key=itemgetter('Company'))

del_counter = 0

user_agents = ["Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.1.4322)",
               "Mozilla/5.0 CK={} (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko",
               "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
               "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36",
               "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36",
               "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)",
               "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)",
               "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/605.1.15 (KHTML, like Gecko)",
               "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko)",
               "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/605.1.15 (KHTML, like Gecko)",
               "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1 Safari/605.1.15",
               "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/603.3.8 (KHTML, like Gecko)"]

headers = {"User-Agent": user_agents[random.randint(0, 11)]}

counter = 0
del_counter = 0
passed_counter = 0
deleted_links = []
passed_links = []
forbidden = []

for item in sorted_openings:
    try:
        response = requests.get(item['Link'], allow_redirects=False, verify=False, headers=headers)
        if response.status_code == 200:
            print(str(response) + ' ' + item['Link'])
            counter += 1
            print(counter)
        elif response.status_code == 304:
            print(response)
            counter += 1
            print(counter)
        elif response.status_code == 403:
            forbidden.append(item['Link'])
            print(response)
            counter += 1
            print(counter)
        else:
            db.openings.remove(item)
            deleted_links.append(item['Link'])
            del_counter += 1
            counter += 1
            print('Deleted ' + item['Link'])
            print(counter)
    except:
        passed_links.append(item['Link'])
        passed_counter += 1
        counter += 1
        print('Passed link ' + item['Link'])
        print(counter)
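A possible follow-up, not part of the original answer: random and time are imported but the User-Agent is only chosen once, so you could rotate it per request and pause between checks, which sometimes helps with 403 responses. A small sketch of the loop header:

for item in sorted_openings:
    headers = {"User-Agent": random.choice(user_agents)}  # new User-Agent each iteration
    time.sleep(random.uniform(1, 3))                      # short pause between checks
    response = requests.get(item['Link'], allow_redirects=False, verify=False, headers=headers)
    # ...then the same status-code checks as above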
I am having a hard time retrieving information from Amazon pages with a small scraping script. Below is my code:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import random
import time
from bs4 import BeautifulSoup

sleep_time_min = 5
sleep_time_max = 10

### INFORMATION FOR PROXY, UA & INFO ROTATION ###
user_agent_list = ['Mozilla/5.0 CK={} (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
                   'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
                   'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36',
                   'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36',
                   'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)',
                   'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)',
                   'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.29 Safari/525.13',
                   'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 Safari/525.13']

opts = Options()
user_agent = random.choice(user_agent_list)
opts.add_argument("user-agent=" + user_agent)

driver = webdriver.Chrome(executable_path='XXXXXXXX', options=opts)
driver.get('https://www.amazon.com/gp/product/B00J4B0S4O')
soup = BeautifulSoup(driver.page_source, 'lxml')

sleep_time_range = range(sleep_time_min, sleep_time_max)
sleep_time = random.choice(sleep_time_range)
time.sleep(sleep_time)

# Extract seller rank & sales category
try:
    rank = driver.find_element_by_xpath('//div[#id="detailBullets_feature_div"]/ul[#class="a-unordered-list a-nostyle a-vertical a-spacing-none detail-bullet-list"]/li/span[#class="a-list-item"]/span[#class="a-text-bold"]')
    # rank = driver.find_element_by_xpath('//div[#id="detail-bullets_feature_div"]').text
    # rank = driver.find_element_by_xpath('//div[#id="a-page"]').text
except:
    rank = "NA"

print(rank)
driver.close()
So basically I am trying to retrieve the following info on the page:
Best Sellers Rank: #711 in Grocery & Gourmet Food (See Top 100 in Grocery & Gourmet Food)
#1 in Grapeseed Oils
Then slice it so I can store ranks & categories into variables.
Here is my problem: my XPath keeps returning empty results despite all my efforts. I have left some other XPaths I tried in the code as comments (e.g. higher-level divs), but the result is unfortunately the same.
I fail to see why these results remain empty. Do you have any idea?
Thanks a lot for the help here
I ended up doing it the ugly way; not very beautiful, but working :)
import re  # needed for re.sub below

try:
    rank_main = soup.find_all('span', 'a-list-item')
    rank_main = str(rank_main)
    rank_main = rank_main.split("(<a href")[0]
    rank_main = rank_main.split("#")[1]
    rank_main = rank_main.replace("amp;", "")
    rank1bis_nb = rank_main.split('in')[0]
    rank1bis_cat = rank_main.split('in ')[1]
except:
    rank1bis_nb = "NA"
    rank1bis_cat = "NA"

try:
    rank_raw = soup.find_all('ul', 'a-unordered-list a-nostyle a-vertical zg_hrsr')
    rank_raw = str(rank_raw)
    rank_raw = rank_raw.replace("[", "")
    rank_raw = rank_raw.replace("]", "")
    rank_raw = rank_raw.replace("#", "")
    rank_raw = re.sub('<[^>]+>', '', rank_raw)
    rank_raw = rank_raw.strip()
    rank_raw = rank_raw.replace("amp;", "")
    rank2bis_nb = rank_raw.split('in')[0]
    rank2bis_cat = rank_raw.split('in ')[1]
except:
    rank2bis_nb = "NA"
    rank2bis_cat = "NA"
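For reference, a possibly tidier sketch of the same extraction, assuming the page keeps the detailBullets_feature_div structure with the "#<rank> in <category>" text shown above:

import re

# join the text of all detail-bullet list items, then pull out rank/category pairs
rank_text = " ".join(li.get_text(" ", strip=True)
                     for li in soup.select("#detailBullets_feature_div li"))
ranks = re.findall(r"#([\d,]+) in ([^(#]+)", rank_text)
# e.g. [('711', 'Grocery & Gourmet Food '), ('1', 'Grapeseed Oils')]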