I'm trying to scrape all the follower names from a profile page using requests module. The problem is when I run the script below, I get the first 20 names over and over again.
The parameters used in post requests only have two keys and values like size:20 and continuation:timestamp. I tried to use the parameters in the right way but still I get the same results repeatedly.
import time
import requests
link = 'https://api-mainnet.rarible.com/marketplace/api/v4/followers'
params = {'user': '0xe744d23107c9c98df5311ff8c1c8637ec3ecf9f3'}
payload = {"size": 20}
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36',
'origin': 'https://rarible.com',
'referer': 'https://rarible.com/'
}
with requests.Session() as s:
s.headers.update(headers)
while True:
res = s.post(link,params=params,json=payload)
print(s.headers)
for item in res.json():
print(item['owner'].get('name',''))
payload['continuation'] = f"{int(time.time() * 1000)}"
time.sleep(2)
How can I parse all the follower names from that page using requests?
Your next continuation value is in X-CONTINUATION response header, so this will work when increasing size in payload doesn't:
import requests
link = 'https://api-mainnet.rarible.com/marketplace/api/v4/followers'
params = {'user': '0xe744d23107c9c98df5311ff8c1c8637ec3ecf9f3'}
payload = {"size": 20}
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36',
'origin': 'https://rarible.com',
'referer': 'https://rarible.com/'
}
res = requests.post(link, headers=headers, params=params, json=payload)
print(res.headers["X-CONTINUATION"])
while True:
for item in res.json():
print(item['owner'].get('name',))
if not res.headers["X-CONTINUATION"]:
break
payload['continuation'] = res.headers["X-CONTINUATION"]
res = requests.post(link, headers=headers, params=params, json=payload)
some api may block you from extracting values more than certain limit and also may show in pages with limits.
For me just increasing the size payload worked with your code.
import time
import requests
link = 'https://api-mainnet.rarible.com/marketplace/api/v4/followers'
params = {'user': '0xe744d23107c9c98df5311ff8c1c8637ec3ecf9f3'}
payload = {"size": 10000}
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36',
'origin': 'https://rarible.com',
'referer': 'https://rarible.com/'
}
with requests.Session() as s:
s.headers.update(headers)
res = s.post(link,params=params,json=payload)
print(len(res.json()))
for item in res.json():
print(item['owner'].get('name',''))
Related
For a small project I tried to get past the google consent page to webscrape the prices found by it. However i could not get the to the good stuff. Sofar i tried the following code, as also proposed by Using python requests with google search.
First try:
import requests
import pandas as pd
from bs4 import BeautifulSoup as bs
#%% get acces
headers = {
'user-agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
"cookie": "CONSENT=YES"}
cookie = {
'set-cookie': 'CONSENT=YES+cb.20211212-16-p1.nl+FX+671'
}
url = 'https://www.google.com/search?q=kitesurf+bar&rlz=1C1CAFA_enNL603NL674&sxsrf=AOaemvJF3BPrjeczkCI1e1YotCSJYcKjug:1641909060258&source=lnms&tbm=shop&sa=X&ved=2ahUKEwiBwqjy66n1AhXO-KQKHUKVBSoQ_AUoAXoECAEQAw'
s = requests.Session()
s.get(url,headers=headers)
purl = 'https://consent.google.com/s'
payload = {
'gl': 'NL',
'm': 'false',
'pc': 'srp',
'continue': 'https://www.google.com/search?q=kitesurf+bar&rlz=1C1CAFA_enNL603NL674&sxsrf=AOaemvJF3BPrjeczkCI1e1YotCSJYcKjug:1641909060258&source=lnms&tbm=shop&sa=X&ved=2ahUKEwiBwqjy66n1AhXO-KQKHUKVBSoQ_AUoAXoECAEQAw',
'ca': 'r',
'x': '6',
'v': 'cb.20211212-16-p1.nl+FX+092',
't': 'ADw3F8jHa3HqOqq133-wOkXCYf4K_r-AIA:1641909071268',
'hl': 'nl',
'src': '1'
}
s.post(purl,params=payload,headers=headers)
page = s.get(url, headers=headers, cookies=cookie)
Second try:
import requests
from bs4 import BeautifulSoup
with requests.Session() as s:
# url = f"https://www.google.com/search?q=fitness+wear"
headers = {
"referer":"referer: https://www.google.com/",
"user-agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36"
}
s.post(url, headers=headers)
response = s.get(url, headers=headers)
soup = BeautifulSoup(response.text, 'html.parser')
print(soup)```
I'm trying to compose a post request for uploading a photo to Instagram, but the third request returns a 403 error to "instagram.com/create/configure/"
My code:
import re
import requests
import urllib.request
from datetime import datetime
link = 'https://www.instagram.com/accounts/login/'
login_url = 'https://www.instagram.com/accounts/login/ajax/'
login = 'login_name'
password = '0000'
time = int(datetime.now().timestamp())
print(time)
payload = {
'username': login,
'enc_password': f'#PWD_INSTAGRAM_BROWSER:0:{time}:{password}',
'queryParams': {},
'optIntoOneTap': 'false'
}
with requests.Session() as s:
r = s.get(link)
a = r.cookies['csrftoken']
print(r.cookies)
print(a)
csrf = re.findall(r"csrf_token\":\"(.*?)\"", r.text)[0]
print(csrf)
r = s.post(login_url, data=payload, headers={
"User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36",
"X-Requested-With": "XMLHttpRequest",
"Referer": "https://www.instagram.com/accounts/login/",
"x-csrftoken": csrf
})
print(r.status_code)
print(r.url)
print(r.text)
print(s.cookies)
r = s.get('https://www.instagram.com/accounts/edit/')
print(login in r.text)
microtime = int(datetime.now().timestamp())
headers = {
"content-type": "image / jpg",
"X-Entity-Name" : f"fb_uploader_{microtime}",
"Offset": "0",
"User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36",
"x-entity-length": "299255",
"X-Instagram-Rupload-Params": f'{{"media_type": 1, "upload_id": {microtime}, "upload_media_height": 1080, "upload_media_width": 1080}}',
"x-csrftoken": csrf,
"x-ig-app-id": "1217981644879628"
}
img = urllib.request.urlopen('https://sun9-64.userapi.com/RVgUHSq9fXrDr8YBJ4a4h9xwN4EQA_8BXuQ5Vg/Mdx3LwawEmY.jpg')
photo = img.read()
r = s.post(f'https://www.instagram.com/rupload_igphoto/fb_uploader_{microtime}', data=open("4.jpg", "rb"), headers=headers)
print(r.text)
headers = {
'Content-Length': '104',
'content-type': 'application/x-www-form-urlencoded',
"origin": "https://www.instagram.com",
"referer": "https://www.instagram.com/create/details/",
'user-agent': "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36",
"x-csrftoken": csrf,
"x-ig-app-id": "1217981644879628",
"X-Requested-With": "XMLHttpRequest"
}
body = {
'upload_id': f'{microtime}',
"caption":'00000000000000000000',
'usertags':'',
'custom_accessibility_caption': '',
'retry_timeout':''
}
r = s.post('https://www.instagram.com/create/configure/', data=body, headers=headers)
print(r.status_code)
print(r.text)
Process:
First authorization, everything is okay with that
The second request is to upload a photo, everything is ok too
The third request, with the parameters of the photo, there is an error ...
error 403 is returned
You need to give the csrf_token and session_id as cookies in the third request.
you can find the complete guide here:
Share a post into your Instagram account using the requests library
I'm using urlopen to extract data from a list of websites right now but keep running in problems with unsuccessful requests. Can anyone help me with it?
I save the website as HTML file
path = "/Users/runyao/Downloads/The Proposition 65 List | OEHHA.html"
soup = BeautifulSoup(open(path), "html.parser")
list = []
pattern = "(chemicals)+([a-z0-9-])"
for counter in range(1,852):
temp = str(soup.find_all('option')[counter])
temptext = temp.replace("\n","/")
temptext = temptext.replace('"',"/")
temptext = temptext.replace(">","")
templist = temptext.split("/")
list.append(templist[-4])
url = 'https://oehha.ca.gov/chemicals/'+ list[1] + '/'
user_agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'
values = {'name' : 'Michael Foord',
'location' : 'Northampton',
'language' : 'Python' }
headers = { 'User-Agent' : user_agent }
data = urllib.parse.urlencode(values)
data = data.encode('ascii')
req = urllib.request.Request(url, data, headers)
with urllib.request.urlopen(req) as response:
the_page = response.read()
import urllib.parse
import urllib.request
url = "https://oehha.ca.gov/chemicals/"+ list[1]
headers = {}
headers['User-Agent'] = "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17"
req = urllib.request.Request(url, headers = headers)
resp = urllib.request.urlopen(req)
respData = BeautifulSoup(resp, "html.parser")
print(respData)
<html>
<head>
<meta content="noindex,nofollow" name="robots"/>
<script src="/_Incapsula_Resource?SWJIYLWA=5074a744e2e3d891814e9a2dace20bd4,719d34d31c8e3a6e6fffd425f7e032f3">
</script>
<body>
</body></head></html>
And if you try adding more headers , like this....this work for me :
import requests
headers = {
'authority': 'oehha.ca.gov',
'cache-control': 'max-age=0',
'upgrade-insecure-requests': '1',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
'sec-fetch-user': '?1',
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
'sec-fetch-site': 'same-origin',
'sec-fetch-mode': 'navigate',
'accept-encoding': 'gzip, deflate, br',
'accept-language': 'es-ES,es;q=0.9',
'cookie': 'visid_incap_1712983=Dfo+CVLdTCSzMHcIXEgPSb1y7l0AAAAAQUIPAAAAAAAetg/EHDkFJUigbIo4eaK4; incap_ses_532_1712983=dFrgDpkdqkYCF1u+mQxiB71y7l0AAAAAksMc42V5CJx6OdUZdeHflA==; has_js=1; _ga=GA1.2.626207180.1575908117; _gid=GA1.2.512657448.1575908117; __utma=158387685.626207180.1575908117.1575908117.1575908117.1; __utmc=158387685; __utmz=158387685.1575908117.1.1.utmcsr=(direct)^|utmccn=(direct)^|utmcmd=(none); __utmt=1; __utmt_b=1; _aeaid=5e12b9d6-0171-4fde-8ccf-1bba809a1bb2; aeatstartmessage=true; __utmb=158387685.4.10.1575908117',
'if-none-match': '^\\^1575853039-1^\\^',
}
response = requests.get('https://oehha.ca.gov/chemicals/abiraterone-acetate', headers=headers)
print(response.content)
UPDATE:I have updated the code using urllib , probably have problems because they have blocked the IP from where you make the requests.
from bs4 import BeautifulSoup
import urllib.parse
import urllib.request
url = "https://oehha.ca.gov/chemicals/abiraterone-acetate"
# headers = {}
# headers['User-Agent'] = "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17"
headers = {
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
'cookie': 'visid_incap_1712983=Dfo+CVLdTCSzMHcIXEgPSb1y7l0AAAAAQUIPAAAAAAAetg/EHDkFJUigbIo4eaK4; incap_ses_532_1712983=dFrgDpkdqkYCF1u+mQxiB71y7l0AAAAAksMc42V5CJx6OdUZdeHflA==; has_js=1; _ga=GA1.2.626207180.1575908117; _gid=GA1.2.512657448.1575908117; __utma=158387685.626207180.1575908117.1575908117.1575908117.1; __utmc=158387685; __utmz=158387685.1575908117.1.1.utmcsr=(direct)^|utmccn=(direct)^|utmcmd=(none); __utmt=1; __utmt_b=1; _aeaid=5e12b9d6-0171-4fde-8ccf-1bba809a1bb2; aeatstartmessage=true; __utmb=158387685.4.10.1575908117',
}
req = urllib.request.Request(url, data=None, headers = headers)
resp = urllib.request.urlopen(req)
respData = BeautifulSoup(resp,"html.parser",from_encoding="iso-8859-1")
print(respData)
Result :
im trying to get specific data in json response from spotify. My current code is:
with requests.Session()as(c):
url = 'https://accounts.spotify.com/en/login?continue=https:%2F%2Fwww.spotify.com%2Fint%2Faccount%2Foverview%2F'
headers = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
page = c.get(url, headers=headers)
CSRF = page.cookies['csrf_token']
headers = {'Accept': '*/*', 'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 10_0_1 like Mac OS X) AppleWebKit/602.1.50 (KHTML, like Gecko) Version/10.0 Mobile/14A403 Safari/602.1',
'Referer': 'https://accounts.spotify.com/en/login/?continue=https:%2F%2Fwww.spotify.com%2Fus%2Fgooglehome%2Fregister%2F&_locale=en-US'}
url = 'https://accounts.spotify.com/api/login'
login_data = {
'remember': 'true',
'username': USER,
'password': PASS,
'csrf_token': CSRF
}
cookies = dict(__bon='MHwwfC0xNDAxNTMwNDkzfC01ODg2NDI4MDcwNnwxfDF8MXwx')
login = c.post(url, headers=headers, data=login_data, cookies=cookies)
if '{"displayName":"' in login.text:
url = 'https://www.spotify.com/us/account/overview/'
capture = c.get(url, headers=headers)
csr = capture.headers['X-Csrf-Token']
json_data = json.loads(login.text)
result = json_data['displayName']
print(result)
when I run this
{"displayName":"John Doe",
"smallImageUrl":"https://scontent.xx.fbcdn.net/v/t1.0-1/p50x50/10407982_10104564418730171_2968639978505808989_n.jpg?_nc_cat=110\u0026_nc_oc=AQmoXCg0tfbf9LuxGWAbpEv-96K57xmie4S3avDrYh3l90g8W-ParNV5mNK0oPU6ERk\u0026_nc_ht=scontent.xx\u0026oh=2fe2149364f012a3c5e43e6d999375ab\u0026oe=5DBD8940",
"largeImageUrl":"https://scontent.xx.fbcdn.net/v/t1.0-1/p200x200/10407982_10104564418730171_2968639978505808989_n.jpg?_nc_cat=110\u0026_nc_oc=AQmoXCg0tfbf9LuxGWAbpEv-96K57xmie4S3avDrYh3l90g8W-ParNV5mNK0oPU6ERk\u0026_nc_ht=scontent.xx\u0026oh=3aec23a8c56be536739ba4cca4e1cc6e\u0026oe=5DBC409D"}
I would like to only print the value: John Doe (SOLVED). but how to get John Doe value in field for discord bot ?
embed=discord.Embed()
embed.add_field(name=undefined, value=undefined, inline=False)
await self.bot.say(embed=embed)
you can use the json method of the requests response object.
login = c.post(url, headers=headers, data=login_data, cookies=cookies)
login_json = login.json()
if "displayName" in login_json:
print(login_json["displayName"])
Use json library
import json
#Your response, probably login.text
s = """{"displayName":"John Doe","smallImageUrl":"https://scontent.xx.fbcdn.net/v/t1.0-1/p50x50/10407982_10104564418730171_2968639978505808989_n.jpg?_nc_cat=110\u0026_nc_oc=AQmoXCg0tfbf9LuxGWAbpEv-96K57xmie4S3avDrYh3l90g8W-ParNV5mNK0oPU6ERk\u0026_nc_ht=scontent.xx\u0026oh=2fe2149364f012a3c5e43e6d999375ab\u0026oe=5DBD8940","largeImageUrl":"https://scontent.xx.fbcdn.net/v/t1.0-1/p200x200/10407982_10104564418730171_2968639978505808989_n.jpg?_nc_cat=110\u0026_nc_oc=AQmoXCg0tfbf9LuxGWAbpEv-96K57xmie4S3avDrYh3l90g8W-ParNV5mNK0oPU6ERk\u0026_nc_ht=scontent.xx\u0026oh=3aec23a8c56be536739ba4cca4e1cc6e\u0026oe=5DBC409D"}"""
dj = json.loads(s)
print(dj["displayName"])
OUTPUT
John Doe
from fake_useragent import UserAgent
import requests
ua = UserAgent()
header = {'User-Agent':str(ua.chrome)}
d = {"query": "/api/v2/details/ip/", "query_entry": "41.219.127.69"}
r = requests.get("https://talosintelligence.com/sb_api/query_lookup/",
data = d, headers=header)
When I run the same result from the main site "talosintelligence.com" and look at the network counsel, that exact URL is responds with a JSON file but a get request from python returns None.
I got it to work by setting the referer header..
import requests
sess = requests.session()
ip_addr = "41.219.127.69"
ret = sess.get('https://talosintelligence.com/sb_api/query_lookup', data={"query": "/api/v2/details/ip/", "query_entry": ip_addr, "offset": 0, "order": "ip asc"}, headers={'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.31 Safari/537.36', 'referer': 'https://talosintelligence.com/reputation_center/lookup?search=' + ip_addr})