I tried to add proxies, but it doesn't work.
When I print(r.json()), the result is always my IP.
Here is the code:
import requests

s = requests.Session()
proxy = open("proxies.txt", "r").readlines()
for i in proxy:
    i = i.replace("\n", "")
    proxies = {'http': i}
    data = {
        'login': '***********',
        'password': '*********',
    }
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.41 Safari/537.36'
    }
    r = s.post("https://example.com", headers=headers, proxies=proxies, data=data)
    print(r.json())
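A likely cause, for what it's worth: requests keys the proxies mapping by URL scheme, so a dict with only an 'http' entry is never consulted for an https:// URL and the request goes out directly. A minimal sketch (the proxy address is a placeholder):

import requests

proxy = "203.0.113.10:8080"  # placeholder address, not a real proxy
proxies = {
    'http': f'http://{proxy}',
    'https': f'http://{proxy}',  # without this key, https:// URLs bypass the proxy entirely
}
r = requests.get("https://httpbin.org/ip", proxies=proxies)
print(r.json())  # should report the proxy's IP, not yours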
I am trying to retrieve the HTML response code of a website redirected by 302. It seems that Python 3 is not following the temporary redirect and returns the original page instead.
Is there a way to configure the request so that it follows the 302?
import random
import requests
from pprint import pprint
from bs4 import BeautifulSoup

sku = '11343654'
url = f'https://www.zurrose.de/catalogsearch/result?q={sku}'
headers = {
    'Access-Control-Allow-Origin': '*',
    'Access-Control-Allow-Methods': 'GET',
    'Access-Control-Allow-Headers': 'Content-Type',
    'Access-Control-Max-Age': '3600',
    'Referer': 'https://www.zurrose.de',
    'Connection': 'keep-alive'
}
user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:100.0) Gecko/20100101 Firefox/100.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.64 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.67 Safari/537.36'
]
headers['User-Agent'] = random.choice(user_agents)
r = requests.post(url, headers=headers)
print('start')
if r.history:
    print("Request was redirected")
    print(r.history)
    for resp in r.history:
        print(resp.status_code, resp.url)
    print("Final destination:")
    print(r.status_code, r.url)
if not r.history:
    print(f'No redirect on {url}. Status {r.status_code}. PP URL not found for {sku}')
elif r.history[0].status_code < 300:
    print(f'No PP URL retrieved for {sku} on {url}. Status {r.history[0].status_code}')
soup = BeautifulSoup(r.content, 'html.parser')
for i in soup.select('link[rel*=canonical]'):
    url_pp = i['href']
    print(url_pp)
pprint(r.content)
The method you want to use will not redirect you to the correct page. The issue is that the redirect happens via cookies, so you need to send the request within a session. I'll show two ways to get the page you need:
import requests
from bs4 import BeautifulSoup

def show_product_ajax(sku):
    url = f"https://www.zurrose.de/search/ajax/suggest/?q={sku}"
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36'
    }
    for product in requests.get(url, headers=headers).json():
        print(product['title'], product['url'],
              BeautifulSoup(product['price'], features='lxml').find('span', class_='price').getText())

def show_product_session(sku):
    url = f'https://www.zurrose.de/catalogsearch/result?q={sku}'
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36'
    }
    s = requests.Session()
    r = s.get(url, headers=headers)
    soup = BeautifulSoup(r.text, features='lxml')
    product_url = soup.find('meta', attrs={'itemprop': 'url'}).get('content')
    product_title = soup.find('span', attrs={'itemprop': 'name', 'data-ui-id': 'page-title-wrapper'}).getText()
    product_price = soup.find('span', class_='price').getText()
    print(product_title, product_url, product_price)

show_product_ajax(11343654)
show_product_session(11343654)
Result:
SYNGUT Synbiotikum m.Probiotika u.Prebiot.Beutel https://www.zurrose.de/produkte/syngut-synbiotikum-m-probiotika-u-prebiot-beutel-344764 23,50 €
SYNGUT Synbiotikum m.Probiotika u.Prebiot.Beutel-15 St https://www.zurrose.de/produkte/syngut-synbiotikum-m-probiotika-u-prebiot-beutel-344764?variation=344764_6502482 23,50 €
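A side note on the original question: requests already follows redirects by default (allow_redirects=True), and the intermediate responses are kept in r.history; the session approach above is only needed here because this site routes you via cookies rather than a plain 302. A minimal sketch of inspecting a redirect chain, using httpbin.org as a stand-in URL:

import requests

# httpbin's /redirect/2 endpoint issues two 302s before the final 200
r = requests.get("https://httpbin.org/redirect/2")  # allow_redirects=True is the default
for resp in r.history:
    print(resp.status_code, resp.url)  # the intermediate 302 responses
print(r.status_code, r.url)            # the final destination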
To send a message to my Telegram channel, I use the following template, which works perfectly:
import requests

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36"
}
chat_telegram = ['XXXXXXXXXXXXX']
textalert = 'Test'
botalert = 'YYYYYYYYYYYYYYYYYYYYY'
urlalert = f'https://api.telegram.org/bot{botalert}/sendMessage'
params = {'text': textalert, 'chat_id': chat_telegram, 'parse_mode': 'HTML'}
return_request = requests.get(urlalert, headers=headers, params=params)
I've tried all four of the following ways to retrieve the message ID that was sent:
print(return_request['update']['message']['message_id'])
print(return_request['message']['message_id'])
print(return_request.update.message.message_id)
print(return_request.message.message_id)
But none of them worked. How should I proceed to retrieve the value I need?
Here is the API map
The message ID is stored in the body of the response (response.text); you can retrieve it via the json() method:
import requests

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36"
}
chat_id = ['chat id']
textalert = 'Test'
botalert = 'some token'
urlalert = f'https://api.telegram.org/bot{botalert}/sendMessage'
params = {'text': textalert, 'chat_id': chat_id, 'parse_mode': 'HTML'}
response = requests.get(urlalert, headers=headers, params=params)
json_data = response.json()
# get the message id
print('message id', json_data['result']['message_id'])
Alternatively, you can retrieve it with the json module:
import json
json_data = json.loads(response.text)
# json_data['result']['message_id']
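For reference, a successful sendMessage call returns a body shaped roughly like this (field names per the Telegram Bot API; the values here are illustrative):

{
  "ok": true,
  "result": {
    "message_id": 123,
    "chat": {"id": 111, "type": "private"},
    "date": 1650000000,
    "text": "Test"
  }
}

which is why the ID is read as json_data['result']['message_id'].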
To send a message to Telegram, I use this template:
import requests

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}
urlphoto = 'http://127.0.0.1:0001/Home/Site%20de%20Trabalho%20-%20Home.html'
botalert = 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
chatalert = 'yyyyyyyyyyyyyyyy'
urlalert = "https://api.telegram.org/bot" + botalert + "/sendMessage?text=" + urlphoto + "&chat_id=" + chatalert + "&parse_mode=HTML"
requests.get(urlalert, headers=headers)
But when the message is sent, the link does not arrive intact: the %20 sequences are converted into spaces.
How should I proceed so that the link is delivered exactly like this:
http://127.0.0.1:0001/Home/Site%20de%20Trabalho%20-%20Home.html
Use a parameters dictionary, and the parameters will be encoded correctly for you:
import requests

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}
urlphoto = 'http://127.0.0.1:0001/Home/Site%20de%20Trabalho%20-%20Home.html'
botalert = 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
chatalert = 'yyyyyyyyyyyyyyyy'
urlalert = f'https://api.telegram.org/bot{botalert}/sendMessage'
params = {'text': urlphoto, 'chat_id': chatalert, 'parse_mode': 'HTML'}
requests.get(urlalert, headers=headers, params=params)
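To see why this works: requests percent-encodes parameter values the way urllib.parse.urlencode does, so the literal % in %20 is sent as %25; Telegram then decodes the query string exactly once and recovers %20 intact. A quick check of the encoding, standard library only:

from urllib.parse import urlencode

print(urlencode({'text': 'Site%20de%20Trabalho'}))
# text=Site%2520de%2520Trabalho  -> decoded once by the server back to Site%20de%20Trabalho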
You can define urlphoto like this:
urlphoto = 'http://127.0.0.1:0001/Home/Site%20de%20Trabalho%20-%20Home.html'.replace('%20', '%2520')
This sends a literal percent sign followed by 20, so it survives the single round of URL decoding on Telegram's side.
Try this:
import requests
from requests.utils import quote

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}
urlphoto = 'http://127.0.0.1:0001/Home/Site%20de%20Trabalho%20-%20Home.html'
botalert = 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
chatalert = 'yyyyyyyyyyyyyyyy'
urlalert = f"https://api.telegram.org/bot{botalert}/sendMessage"
# quote() re-encodes the literal % as %25 so it survives one round of decoding
requests.get(f"{urlalert}?text={quote(urlphoto, safe='')}&chat_id={chatalert}&parse_mode=HTML", headers=headers)
I'm using urlopen to extract data from a list of websites, but I keep running into problems with unsuccessful requests. Can anyone help me with it?
I saved the website as an HTML file:
import urllib.parse
import urllib.request
from bs4 import BeautifulSoup

path = "/Users/runyao/Downloads/The Proposition 65 List | OEHHA.html"
soup = BeautifulSoup(open(path), "html.parser")
chemical_list = []
pattern = "(chemicals)+([a-z0-9-])"
for counter in range(1, 852):
    temp = str(soup.find_all('option')[counter])
    temptext = temp.replace("\n", "/")
    temptext = temptext.replace('"', "/")
    temptext = temptext.replace(">", "")
    templist = temptext.split("/")
    chemical_list.append(templist[-4])

url = 'https://oehha.ca.gov/chemicals/' + chemical_list[1] + '/'
user_agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'
values = {'name': 'Michael Foord',
          'location': 'Northampton',
          'language': 'Python'}
headers = {'User-Agent': user_agent}
data = urllib.parse.urlencode(values)
data = data.encode('ascii')
req = urllib.request.Request(url, data, headers)
with urllib.request.urlopen(req) as response:
    the_page = response.read()
import urllib.parse
import urllib.request
from bs4 import BeautifulSoup

url = "https://oehha.ca.gov/chemicals/" + chemical_list[1]
headers = {}
headers['User-Agent'] = "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17"
req = urllib.request.Request(url, headers=headers)
resp = urllib.request.urlopen(req)
respData = BeautifulSoup(resp, "html.parser")
print(respData)

The response I get back is always this Incapsula challenge page:
<html>
<head>
<meta content="noindex,nofollow" name="robots"/>
<script src="/_Incapsula_Resource?SWJIYLWA=5074a744e2e3d891814e9a2dace20bd4,719d34d31c8e3a6e6fffd425f7e032f3">
</script>
<body>
</body></head></html>
Try adding more headers, like this; this works for me:
import requests

headers = {
    'authority': 'oehha.ca.gov',
    'cache-control': 'max-age=0',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
    'sec-fetch-user': '?1',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
    'sec-fetch-site': 'same-origin',
    'sec-fetch-mode': 'navigate',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'es-ES,es;q=0.9',
    'cookie': 'visid_incap_1712983=Dfo+CVLdTCSzMHcIXEgPSb1y7l0AAAAAQUIPAAAAAAAetg/EHDkFJUigbIo4eaK4; incap_ses_532_1712983=dFrgDpkdqkYCF1u+mQxiB71y7l0AAAAAksMc42V5CJx6OdUZdeHflA==; has_js=1; _ga=GA1.2.626207180.1575908117; _gid=GA1.2.512657448.1575908117; __utma=158387685.626207180.1575908117.1575908117.1575908117.1; __utmc=158387685; __utmz=158387685.1575908117.1.1.utmcsr=(direct)^|utmccn=(direct)^|utmcmd=(none); __utmt=1; __utmt_b=1; _aeaid=5e12b9d6-0171-4fde-8ccf-1bba809a1bb2; aeatstartmessage=true; __utmb=158387685.4.10.1575908117',
    'if-none-match': '^\\^1575853039-1^\\^',
}
response = requests.get('https://oehha.ca.gov/chemicals/abiraterone-acetate', headers=headers)
print(response.content)
UPDATE: I have updated the code to use urllib. You probably had problems because they blocked the IP you were making the requests from.
from bs4 import BeautifulSoup
import urllib.parse
import urllib.request

url = "https://oehha.ca.gov/chemicals/abiraterone-acetate"
# headers = {}
# headers['User-Agent'] = "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17"
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
    'cookie': 'visid_incap_1712983=Dfo+CVLdTCSzMHcIXEgPSb1y7l0AAAAAQUIPAAAAAAAetg/EHDkFJUigbIo4eaK4; incap_ses_532_1712983=dFrgDpkdqkYCF1u+mQxiB71y7l0AAAAAksMc42V5CJx6OdUZdeHflA==; has_js=1; _ga=GA1.2.626207180.1575908117; _gid=GA1.2.512657448.1575908117; __utma=158387685.626207180.1575908117.1575908117.1575908117.1; __utmc=158387685; __utmz=158387685.1575908117.1.1.utmcsr=(direct)^|utmccn=(direct)^|utmcmd=(none); __utmt=1; __utmt_b=1; _aeaid=5e12b9d6-0171-4fde-8ccf-1bba809a1bb2; aeatstartmessage=true; __utmb=158387685.4.10.1575908117',
}
req = urllib.request.Request(url, data=None, headers=headers)
resp = urllib.request.urlopen(req)
respData = BeautifulSoup(resp, "html.parser", from_encoding="iso-8859-1")
print(respData)
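A hedged aside: the cookie header above is what satisfies Incapsula here, and those visid_incap_*/incap_ses_* values expire. A sketch of letting a requests.Session collect and reuse whatever cookies the site sets, instead of pasting them in by hand (this may still fail if the challenge requires JavaScript):

import requests
from bs4 import BeautifulSoup

s = requests.Session()
s.headers.update({
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
})
# first request: let the site set its cookies on the session
s.get('https://oehha.ca.gov')
# second request: the stored cookies are sent back automatically
r = s.get('https://oehha.ca.gov/chemicals/abiraterone-acetate')
print(BeautifulSoup(r.content, 'html.parser').title)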
from fake_useragent import UserAgent
import requests

ua = UserAgent()
header = {'User-Agent': str(ua.chrome)}
d = {"query": "/api/v2/details/ip/", "query_entry": "41.219.127.69"}
r = requests.get("https://talosintelligence.com/sb_api/query_lookup/",
                 data=d, headers=header)
When I run the same query from the main site "talosintelligence.com" and look at the network console, that exact URL responds with a JSON file, but a GET request from Python returns nothing.
I got it to work by setting the referer header:
import requests

sess = requests.session()
ip_addr = "41.219.127.69"
ret = sess.get(
    'https://talosintelligence.com/sb_api/query_lookup',
    data={"query": "/api/v2/details/ip/", "query_entry": ip_addr,
          "offset": 0, "order": "ip asc"},
    headers={
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.31 Safari/537.36',
        'referer': 'https://talosintelligence.com/reputation_center/lookup?search=' + ip_addr,
    },
)
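The endpoint appears to validate the Referer against its own lookup page; once it matches, the JSON body can be read in the usual way, continuing from the code above:

if ret.ok:
    data = ret.json()  # the same JSON the browser's network console shows
    print(data)
else:
    print("Lookup failed:", ret.status_code)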