I'm working on scraping data from barchart.com using code modified from this Stack Overflow question:
The header and payload information are from the XHR of the website I was attempting to scrape.
from urllib.parse import unquote

import requests
geturl=r'https://www.barchart.com/options/highest-implied-volatility'
apiurl=r'https://www.barchart.com/proxies/core-api/v1/quotes/get'
getheaders={
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
'accept-encoding': 'gzip, deflate, br',
'accept-language': 'en-US,en;q=0.9',
'cache-control': 'max-age=0',
'upgrade-insecure-requests': '1',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'
}
getpay={
'page': 'all'
}
s=requests.Session()
r=s.get(geturl,params=getpay, headers=getheaders)
headersIV = {
'method': 'GET',
'scheme': 'https',
'authority': 'www.barchart.com',
'Host' : 'www.barchart.com',
'Accept': 'application/json',
'Accept-Encoding': 'gzip, deflate, br',
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15',
'Accept-Language': 'en-us',
'Referer': 'https://www.barchart.com/options/highest-implied-volatility',
'Connection': 'keep-alive',
'X-XSRF-TOKEN': 'eyJpdiI6Ik8vQTBkcGxZVVF1aG5QeE9TUnk5L3c9PSIsInZhbHVlIjoiMDd6STJyM1FPZEtMMFdLNEcrVjNNWUMva1l3WWxwblMvdEFZMEIzSllzalFySGFoblcyRzgrRmNZa1RMRHdZcTlBVExQTjBQUEhVdTVaNWhMZlJ0ZFM4c3ZaeHMvVmptM2FGQXJobnM1WTl1REx1d3M1eDI2RUc2SEtHY2wzTnUiLCJtYWMiOiIyNGExYjI3N2JkOGRiZGEwYjY4MTQ3OGFiYmYxZGE3ZmJhZmQyMDQwM2NiZTc0YTMzZDFkNjI4ZGIwZmY2YTU0In0=',
'path': '/proxies/core-api/v1/options/get?fields=symbol%2CbaseSymbol%2CbaseLastPrice%2CbaseSymbolType%2CsymbolType%2CstrikePrice%2CexpirationDate%2CdaysToExpiration%2CbidPrice%2Cmidpoint%2CaskPrice%2ClastPrice%2Cvolume%2CopenInterest%2CvolumeOpenInterestRatio%2Cvolatility%2CtradeTime%2CsymbolCode%2ChasOptions&orderBy=volatility&baseSymbolTypes=stock&between(lastPrice%2C.10%2C)=&between(daysToExpiration%2C15%2C)=&between(tradeTime%2C2021-10-21%2C2021-10-22)=&orderDir=desc&between(volatility%2C60%2C)=&limit=200&between(volume%2C500%2C)=&between(openInterest%2C100%2C)=&in(exchange%2C(AMEX%2CNASDAQ%2CNYSE))=&meta=field.shortName%2Cfield.type%2Cfield.description&hasOptions=true&raw=1',
}
payloadIV={
'fields': 'symbol,baseSymbol,baseLastPrice,baseSymbolType,symbolType,strikePrice,expirationDate,daysToExpiration,bidPrice,midpoint,askPrice,lastPrice,volume,openInterest,volumeOpenInterestRatio,volatility,tradeTime,symbolCode,hasOptions',
'orderBy': 'volatility',
'baseSymbolTypes': 'stock',
'between(lastPrice,.10,)':'',
'between(daysToExpiration,15,)':'',
'between(tradeTime,2021-10-21,2021-10-22)':'',
'orderDir': 'desc',
'between(volatility,60,)':'',
'limit': '200',
'between(volume,500,)':'',
'between(openInterest,100,)':'',
'in(exchange,(AMEX,NASDAQ,NYSE))':'',
'meta': 'field.shortName,field.type,field.description',
'hasOptions': 'true',
'raw': '1'
}
r=s.get(apiurl,params=payloadIV,headers=headersIV)
j=r.json()
print(j)
It returns this error message: {'error': {'message': 'Internal error.', 'code': 500}}
I am pretty new to scraping data using API and XHR data. I think I might be doing many things correctly right now but I don't know where I might be making the mistake.
Related
I am trying to use the API endpoint from this site:
https://horoguides.com/hk/watch_finder
I found the API endpoint in the network tab and tried to reproduce the API call with the following code:
import requests
url = "https://horoguides.com/hk/ajaj/watch/searchWatches"
payload = {
"addLimit": "LIMIT 0, 20",
"addOrder": "ORDER BY establish DESC",
}
headers = {
'Accept': "application/json, text/javascript, */*; q=0.01",
'Accept-Language': "de-DE,de;q=0.9,en-US;q=0.8,en;q=0.7",
'Connection': "keep-alive",
'Content-Type': "multipart/form-data; boundary=---011000010111000001101001",
'Cookie': "PHPSESSID=siob5k70qu4gh8bkio07qtocv3; _gid=GA1.2.40295814.1663575664; __gads=ID=2fc582d62ff2a986-223e4e8c26ce00a9:T=1663575664:RT=1663575664:S=ALNI_MaTX_1U4CELXasmH0td3MvCRQ5S5Q; _gat_UA-90322481-1=1; _gat_gtag_UA_90322481_1=1; _ga_6Z9E9PKG02=GS1.1.1663594500.3.1.1663594710.0.0.0; _ga=GA1.1.699639573.1663575664",
'Origin': "https://horoguides.com",
'Referer': "https://horoguides.com/hk/watch_finder",
'Sec-Fetch-Dest': "empty",
'Sec-Fetch-Mode': "cors",
'Sec-Fetch-Site': "same-origin",
'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36",
'X-Requested-With': "XMLHttpRequest",
'sec-ch-ua': "^\^Chromium^^;v=^\^104^^, ^\^"
}
resp = requests.request("POST", url, json=payload, headers=headers)
print(resp.status_code)
respJSON = resp.json()
print(respJSON)
But as a response I only get:
200
{'status': 'invalid'}
Why is this response from the API endpoint not working?
I also tried to run this in Insomnia and get the same result.
You need to fix the payload. The following code works:
import requests
url = "https://horoguides.com/hk/ajaj/watch/searchWatches"
payload = {
"addLimit": "LIMIT 0, 20",
"addOrder": "ORDER BY establish DESC",
'lang': 'hk',
'ajaxID': 'searchWatches'
}
headers = {
'Accept': "application/json, text/javascript, */*; q=0.01",
'Accept-Language': "de-DE,de;q=0.9,en-US;q=0.8,en;q=0.7",
'Connection': "keep-alive",
'Origin': "https://horoguides.com",
'Referer': "https://horoguides.com/hk/watch_finder",
'Sec-Fetch-Dest': "empty",
'Sec-Fetch-Mode': "cors",
'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36",
'X-Requested-With': "XMLHttpRequest"
}
resp = requests.request("POST", url, data=payload, headers=headers)
print(resp.status_code)
respJSON = resp.json()
print(respJSON)
Result in terminal:
200
{'act': 'watch/searchWatches', 'status': 'success', 'getData': {'a5124': {'id': '5124', 'name': '116610-LN-0001', 'url_name': '116610-ln-97200', 'establish': '2014', 'w_brand_id': '39', 'w_brand_abbr': '', 'w_brand_name': 'ROLEX', 'w_brand_urlname': 'rolex', 'w_brand_localname': '勞力士', 'hype_default_currency': 'NT$', 'w_series_name': 'SUBMARINER', 'w_series_urlname':[....]
For requests documentation, see https://requests.readthedocs.io/en/latest/
I want to send a reset link to email addresses, but I can't pass the captcha. I have a CapMonster account to solve captchas; I tried Selenium before, but I couldn't get it to work.
This is my code:
import requests
import json
s = requests.Session()
Grab = s.get("https://www.instagram.com/accounts/login/")
Headd = {
'accept': '*/*',
'accept-encoding': 'gzip, deflate, br',
'accept-language': 'tr-TR,tr;q=0.9,en-US;q=0.8,en;q=0.7',
'content-length': '104',
'content-type': 'application/x-www-form-urlencoded',
'origin': 'https://www.instagram.com',
'referer': 'https://www.instagram.com/accounts/password/reset/',
'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Mobile Safari/537.36',
'x-csrftoken': Grab.cookies.get_dict()['csrftoken'],
'x-instagram-ajax': 'c6160c6b689a',
'x-requested-with': 'XMLHttpRequest'
}
LoginData = {
"email_or_username": "example#outlook.com",
"recaptcha_challenge_field": ""
}
AccLogin = s.post('https://www.instagram.com/accounts/account_recovery_send_ajax/', headers=Headd, data=LoginData)
res = json.loads(AccLogin.text)
print(res)
This is the result:
{'message': 'checkpoint_required', 'checkpoint_url': 'https://www.instagram.com/challenge/AXHQIDuh8SBT-M1AVt14AvFB8HLczbgGoyMMvnp86BsPApnJhDJkWE04ZvwjjnczcaLk_g/Afxv1hZK6GoZ_gqxVubIGNLbEyMAAMo6gVAokxxs2ScpC72bLEz6kjkjmJPi33BZdcL-SZ8ZNpy9dw/?challenge_node_id=18315435868046003&challenge_context=%7B%22step_name%22:+%22%22,+%22nonce_code%22:+%22bpjtu8gd1a%22,+%22user_id%22:+%22AXGMD9Ch0rKgE6Zo5g91rV1qjm2JFFwxQC1axVNqoGW6heLiXhcW5lqRNcT3aP-73-y_7g%22,+%22cni%22:+%2218315435868046003%22,+%22is_stateless%22:+false,+%22present_as_modal%22:+false%7D', 'lock': False, 'flow_render_type': 0, 'status': 'fail'}
I am trying to do some scraping from websites using GET and POST methods, but now I am facing a new challenge.
I am trying to get data from a credit simulator, I found this portuguese site (https://www.bancomontepio.pt/particulares/credito/pessoal/credito-pessoal-online).
As far as I know, I need to use POST method, but I have to specify the data (the Amount value, the Term...). I usually do it by creating a dictionary structure but that is not working.
I'm kinda lost to be fair, maybe the problem is on the header...
Here is my code:
import requests
import warnings
warnings.filterwarnings("ignore")
term=24
amount=5000
url = 'https://simuladores.bancomontepio.pt/ITSCredit.External/Calculator/ITSCredit.Calculator.UI.External/gateway/Calculator/api/Calculator/Calculate?hash=-1359629931'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36',
'Accept-Language': 'pt-PT,pt;q=0.9,en-US;q=0.8,en;q=0.7'}
payload = {'Amount': amount,'Term': term,'ProductCode':"26B1129900X"}
response = requests.post(url, headers=headers, data=payload, verify=False).json()
If I take off the .json(), I get the error Response [410].
The goal is to get the TAN or TAEG that change when term ("Prazo") or amount ("Montante") values change.
Any ideas?
[EDIT]
# Request headers copied from the browser's XHR to the calculator endpoint.
# The original paste contained this dict twice, fused together on one line
# (a syntax error); it is defined exactly once here with identical values.
headers = {
    'Accept': 'application/json, text/plain, */*',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'pt-PT,pt;q=0.9,en-US;q=0.8,en;q=0.7',
    'Connection': 'keep-alive',
    # NOTE(review): hard-coded length taken from the browser request; it only
    # matches a body of exactly that size - confirm it is still wanted.
    'Content-Length': '957',
    'content-type': 'text/plain',
    # Session cookies captured from the browser (includes ASP.NET_SessionId).
    'Cookie': '_gcl_au=1.1.911606195.1646064658; OptanonAlertBoxClosed=2022-02-28T16:45:11.586Z; _ga=GA1.2.1147601977.1646064657; _ga_8WVEJF7X11=GS1.1.1646305654.3.1.1646309750.0; _ga_63QCVBV1V3=GS1.1.1646305679.1.1.1646309750.0; ASP.NET_SessionId=wlfbf2dx4oatlio0vl1ftinq; _gid=GA1.2.449121330.1646650093; calc-cookie=; OptanonConsent=isGpcEnabled=0&datestamp=Mon+Mar+07+2022+11%3A38%3A48+GMT%2B0000+(Hora+padr%C3%A3o+da+Europa+Ocidental)&version=6.30.0&isIABGlobal=false&consentId=6caccc97-6af1-4b55-9049-5694835d9f7a&interactionCount=2&landingPath=NotLandingPage&groups=C0001%3A1%2CC0002%3A1%2CC0003%3A1%2CC0004%3A1&hosts=H10%3A1%2CH20%3A1%2CH7%3A1%2CH8%3A1%2CH23%3A1%2CH11%3A1%2CH24%3A1%2CH13%3A1%2CH25%3A1&genVendors=&geolocation=ES%3B&AwaitingReconsent=false; _gali=slider-container; _gat_UA-186811106-6=1',
    'Host': 'simuladores.bancomontepio.pt',
    'Origin': 'https://simuladores.bancomontepio.pt',
    'Referer': 'https://simuladores.bancomontepio.pt/',
    'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="90", "Google Chrome";v="90"',
    'sec-ch-ua-mobile': '?0',
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-origin',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36',
}
payload = {'CCRDCalculateInput':{},'MultifunctionsCalculateInput':{},'Device':{'Browser':'chrome','BrowserVersion':'90.0.4430.212','Device':'Desktop','Os':'windows','OsVersion':'windows-10','UserAgent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'},'IsCustomer':'true','Amount':7500,'Term':60,'ConditionCode':'26B1129900X-01-I-129900-F','CreditDestinationCode':'129900','ProductCode':'26B1129900X','FinancedExpenses':'false','FrequencyTrancheCode':'null','GoalCode':'C006','GoalDescription':'PROJETOS PESSOAIS','FrequencyTypeCode':'M','FamilyCode':'CP','Proponents':[{'Position':1,
'Birthday':'1992-03-07T13:03:30.000Z','State':'true','EntityType':{'ID':1,'CompanyID':1,'Code':'P','Description':'Proponente','Value':'null','ValueString':'null','State':'true','Imported':'null'},'ExpenseCodes':['009']}],'Counterparts':0,'OptionalExpenses':[{'Code':'009','Factor':1}],'ResidualValue':0,'InterestOnly':0,'Deferment':0}
Now I'm getting an empty json()... The response is 200, but I got this structure:
{'Status': 'Unknown',
'Error': {'VisibleToHuman': False, 'Code': '0', 'Message': ''},
'Result': None}
As far as I know, the status should be "OK" to get some info on the Result.
Cheers
Looks like you need to expand the payload to include more (all) of the parameters (including the cookies, specifically the ASP.NET_SessionId).
import requests
import warnings
warnings.filterwarnings("ignore")
term=24
amount=5000
url = 'https://simuladores.bancomontepio.pt/ITSCredit.External/Calculator/ITSCredit.Calculator.UI.External/gateway/Calculator/api/Calculator/Calculate?hash=-1359629931'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36',
'Accept-Language': 'pt-PT,pt;q=0.9,en-US;q=0.8,en;q=0.7',
'Cookie':'ASP.NET_SessionId=fhkyn1vn5knlw3uhdnh50nii;'}
payload = {
"CCRDCalculateInput":{},
"MultifunctionsCalculateInput":{},
"Device":{
"Browser":"chrome",
"BrowserVersion":"96.0.4664.110",
"Device":"Desktop",
"Os":"windows",
"OsVersion":"windows-10",
"UserAgent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36"},
"IsCustomer":'true',
"Amount":amount,
"Term":term,
"ConditionCode":"26B1129900X-01-I-129900-F",
"CreditDestinationCode":"129900",
"ProductCode":"26B1129900X",
"FinancedExpenses":'false',
"FrequencyTrancheCode":'null',
"GoalCode":"C006",
"GoalDescription":"PROJETOS PESSOAIS",
"FrequencyTypeCode":"M",
"FamilyCode":"CP",
"Proponents":[{
"Position":'1',
"Birthday":"1992-03-10T13:03:24.000Z",
"State":'true',
"EntityType":{
"ID":'1',
"CompanyID":'1',
"Code":"P",
"Description":"Proponente",
"Value":'null',
"ValueString":'null',
"State":'true',
"Imported":'null'},
"ExpenseCodes":[
"009"]}],
"Counterparts":'0',
"OptionalExpenses":[{
"Code":"009",
"Factor":'1'}],
"ResidualValue":'0',
"InterestOnly":'0',
"Deferment":'0'}
# POST the simulation request as a JSON body (TLS verification is disabled,
# as in the rest of this snippet).
response = requests.post(url, headers=headers, json=payload, verify=False)
# Stop early on an HTTP error status instead of parsing an error page.
response.raise_for_status()
jsonData = response.json()
results = jsonData['Result']
if results is None:
    # The API can answer 200 with an empty result (e.g. Status 'Unknown');
    # report what it returned instead of failing with an opaque TypeError.
    raise RuntimeError(
        f"Calculator returned no result: Status={jsonData.get('Status')!r}, "
        f"Error={jsonData.get('Error')!r}"
    )
# Installment and TAN are both read from the first installment period.
first_period = results['PeriodInstallment'][0]
mtic = results['MTIC']
installment = first_period['Installment']
taeg = results['TAEG']
tan = first_period['TAN']
print(f'Installment: {installment}\nTAEG: {taeg}\nTAN: {tan}\nMTIC: {mtic}')
Output:
Installment: 224.5
TAEG: 14.8
TAN: 7.0
MTIC: 5708.2
I would like to get the json data from for instance https://app.weathercloud.net/d0838117883#current using python requests module.
I tried:
import re
import requests
device='0838117883'
URL='https://app.weathercloud.net'
URL1=URL+'/d'+device
URL2=URL+'/device/stats'
headers={'Content-Type':'text/plain; charset=UTF-8',
'Referer':URL1,
'User-Agent':'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/48.0.2564.82 Chrome/48.0.2564.82 Safari/537.36',
'Accept':'application/json, text/javascript,*/*'}
with requests.Session() as s:
#get html from URL1 in order to get the CSRF token
page = s.get(URL1)
CSRF=re.findall('WEATHERCLOUD_CSRF_TOKEN:"(.*)"},',page.text)[0]
#create parameters for URL2, in order to get the json file
params={'code':device,'WEATHERCLOUD_CSRF_TOKEN':CSRF}
page_stats=requests.get(URL2,params=params,headers=headers)
print(page_stats.url)
print(page_stats) #<Response [200]>
print(page_stats.text) #empty
print(page_stats.json()) #error
But the page_stats is empty.
How can I get the stats data from weathercloud?
Inspecting the page with DevTools, you'll find a useful endpoint:
https://app.weathercloud.net/device/stats
You can "replicate" the original web request made by your browser with requests library:
import requests
cookies = {
'PHPSESSID': '************************',
'WEATHERCLOUD_CSRF_TOKEN':'***********************',
'_ga': '**********',
'_gid': '**********',
'__gads': 'ID=**********',
'WeathercloudCookieAgreed': 'true',
'_gat': '1',
'WEATHERCLOUD_RECENT_ED3C8': '*****************',
}
headers = {
'Connection': 'keep-alive',
'sec-ch-ua': '^\\^Google',
'Accept': 'application/json, text/javascript, */*; q=0.01',
'X-Requested-With': 'XMLHttpRequest',
'sec-ch-ua-mobile': '?0',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36',
'sec-ch-ua-platform': '^\\^Windows^\\^',
'Sec-Fetch-Site': 'same-origin',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Dest': 'empty',
'Referer': 'https://app.weathercloud.net/d0838117883',
'Accept-Language': 'it-IT,it;q=0.9,en-US;q=0.8,en;q=0.7,es;q=0.6',
}
params = (
('code', '0838117883'),
('WEATHERCLOUD_CSRF_TOKEN', '****************'),
)
# Replay the browser's XHR: the CSRF token is sent both as a cookie and as a
# query parameter.
response = requests.get('https://app.weathercloud.net/device/stats', headers=headers, params=params, cookies=cookies)
# Fail loudly on an HTTP error status rather than trying to parse its body.
response.raise_for_status()
# Decode the JSON body via requests itself; the `json` module is never
# imported in this snippet, so json.loads() would raise NameError.
json_object = response.json()
json Output:
{'last_update': 1632842172,
'bar_current': [1632842172, 1006.2],
'bar_day_max': [1632794772, 1013.4],
'bar_day_min': [1632845772, 1006.2],
'bar_month_max': [1632220572, 1028],
'bar_month_min': [1632715572, 997.3],
'bar_year_max': [1614418512, 1038.1],
'bar_year_min': [1615434432, 988.1],
'wdir_current': [1632842172, 180],
..............}
That's it.
I am trying to scrape the top-selling products from "shopee.com.my" with Scrapy, and I also tried with requests, but I failed to get a valid JSON object. My requests code is given below:
import requests as r
import json
data = {
'authority': 'shopee.com.my',
'method': 'GET',
'accept-encoding': 'gzip, deflate, br',
'accept-language': 'en-US,en;q=0.9',
'scheme': 'https',
'accept': '*/*, application/json',
'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36',
'x-api-source': 'pc',
'x-requested-with': 'XMLHttpRequest',
'x-shopee-language': 'en',
'sec-fetch-dest': 'empty',
'sec-fetch-mode': 'cors',
'sec-fetch-site': 'same-origin',
}
subcat_url = '/Boys-Fashion-cat.27.2427'
id = subcat_url.split('.')[-1]
data['path'] = f'/api/v2/search_items/?by=sales&limit=50&match_id={id}&newest=0&order=desc&page_type=search&version=2'
data['referer'] = f'https://shopee.com.my{subcat_url}?page=0&sortBy=sales'
url = f'https://shopee.com.my/api/v2/search_items/?by=sales&match_id={id}&newest=0&order=desc&page_type=search&version=2'
req = r.get(url, headers=data)
items = req.json()['items']
print(items)
print(f'Items length: {len(items)}')
Here is my Scrapy code:
import scrapy
import json
from scrapy import Request
from scrapy.http.cookies import CookieJar
header_data = {'authority': 'shopee.com.my',
'method': 'GET',
'scheme': 'https',
'accept': '*/*',
'accept-encoding': 'gzip, deflate, br',
'accept-language': 'en-US,en;q=0.9',
'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36',
# 'cookie': 'SPC_U=-; SPC_IA=-1; SPC_EC=-; SPC_F=7jrWAm4XYNNtyVAk83GPknN8NbCMQEIk; REC_T_ID=476673f8-eeb0-11ea-8919-48df374df85c; _gcl_au=1.1.1197882328.1599225148; _med=refer; _fbp=fb.2.1599225150134.114138691; language=en; _ga=GA1.3.1167355736.1599225151; csrftoken=mu9M72KLd73P9QJusB9zFBP6wV3NGg85; _gid=GA1.3.273342972.1603211749; SPC_SI=yxvc89nmqe97ldvpo6wgeybtc8berzyd; welcomePkgShown=true; AMP_TOKEN=%24NOT_FOUND; REC_MD_41_1000027=1603289427_0_50_0_48; SPC_CT_48918e31="1603289273.lUS7x9IuKN5vNbhzibZCOHrIf6vVQmykU/TXxiOii7w="; SPC_CT_57540430="1603289278.FLT3IdzHC32RmEzFxkOi9pI7qhKIs/yq328elYMuwps="; SPC_CT_50ee4e78="1603289299.gvjW32HwgiQGN/4kj2Ac3YFrpqyHVTO8+UjM+uzxy4E="; _dc_gtm_UA-61915055-6=1; SPC_CT_75d7a2b7="1603289557.t5FvxXhnJacZrKkjnIWCUbAgAxAQ3hG5c1tZBzafwc4="; SPC_R_T_ID="n6Ek85JJY1JZATlhgutfB4KB3qrbmFDYX1+udv1EBAPegPE9xuzM8HFeCy1duskY9+DVLJxe4RqaabhyUuojHQG0NI2TqegihbAge+s3k7w="; SPC_T_IV="SGNXqyZ1jtRYpo5kFeKtYg=="; SPC_R_T_IV="SGNXqyZ1jtRYpo5kFeKtYg=="; SPC_T_ID="n6Ek85JJY1JZATlhgutfB4KB3qrbmFDYX1+udv1EBAPegPE9xuzM8HFeCy1duskY9+DVLJxe4RqaabhyUuojHQG0NI2TqegihbAge+s3k7w="',
'sec-fetch-dest': 'empty',
'sec-fetch-mode': 'cors',
'sec-fetch-site': 'same-origin',
'x-api-source': 'pc',
'x-requested-with': 'XMLHttpRequest',
'x-shopee-language': 'en',
}
class TestSpider(scrapy.Spider):
    """Request Shopee's sales-sorted search API for one sub-category and print each item."""

    name = 'test'
    allowed_domains = ['shopee.com', 'shopee.com.my', 'shopee.com.my/api/']

    def start_requests(self):
        """Yield the search API request for the hard-coded sub-category."""
        subcat_url = '/Baby-Toddler-Play-cat.27.23785'
        # The category id is the last dot-separated component of the URL.
        match_id = subcat_url.split('.')[-1]
        header_data['path'] = f'/api/v2/search_items/?by=sales&limit=50&match_id={match_id}&newest=0&order=desc&page_type=search&version=2'
        header_data['referer'] = f'https://shopee.com.my{subcat_url}?page=0&sortBy=sales'
        url = f'https://shopee.com.my/api/v2/search_items/?by=sales&limit=50&match_id={match_id}&newest=0&order=desc&page_type=search&version=2'
        # Route the response to parse_data explicitly; without a callback
        # Scrapy would hand it to the (undefined) default parse() method.
        yield Request(url=url, headers=header_data, callback=self.parse_data)

    def parse_data(self, response):
        """Decode the JSON response and print the fields of every item."""
        try:
            jdata = json.loads(response.body)
        except ValueError as e:
            # Body was not valid JSON: show the error and the raw body.
            print(f'exception: {e}')
            print(response.body)
            return None
        # NOTE(review): a missing or null 'items' key is treated as an empty
        # result here - confirm that is the desired behaviour.
        items = jdata.get('items') or []
        for item in items:
            name = item['name']
            image_path = item['image']
            absolute_image = f'https://cf.shopee.com.my/file/{image_path}_tn'
            print(f'this is absolute image {absolute_image}')
            price = float(item['price'])/100000
            total_sold = item['sold']
            location = item['shop_location']
            stock = item['stock']
            print(name)
            print(price)
            print(total_sold)
            print(location)
            print(stock)
        return None
I am not using cookies now; I also tried with fresh cookies, but still got no response.
Here are some example links: some of them always return a valid JSON object, but others do not return any response. See the API and direct browser links below:
https://shopee.com.my/Kids-Sports-Outdoor-Play-cat.27.21700?page=0&sortBy=sales
https://shopee.com.my/api/v2/search_items/?by=sales&limit=50&match_id=21700&newest=0&order=desc&page_type=search&version=2
https://shopee.com.my/Bath-Toiletries-cat.27.2422
https://shopee.com.my/api/v2/search_items/?by=sales&limit=50&match_id=2422&newest=0&order=desc&page_type=search&version=2
You can also see the API links in the network tab:
network tab link image
I think you are missing a required header. I sent them like this and it worked:
from pprint import pprint
import requests
headers = {
'authority': 'shopee.com.my',
'pragma': 'no-cache',
'cache-control': 'no-cache',
'x-shopee-language': 'en',
'x-requested-with': 'XMLHttpRequest',
'if-none-match-': '55b03-c3d70d78b473147beeb6551fa9df8ca0',
'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36',
'x-api-source': 'pc',
'accept': '*/*',
'sec-fetch-site': 'same-origin',
'sec-fetch-mode': 'cors',
'sec-fetch-dest': 'empty',
'referer': 'https://shopee.com.my/Kids-Sports-Outdoor-Play-cat.27.21700?page=0&sortBy=sales',
'accept-language': 'es-US,es;q=0.9,en-US;q=0.8,en;q=0.7,es-419;q=0.6',
# 'cookie': '_gcl_au=1.1.1866522785.1603486253; _fbp=fb.2.1603486253254.1114160447; SPC_IA=-1; SPC_EC=-; SPC_U=-; SPC_F=9RO26eJM7IQiFlxki0dAdQCcCsgPwz67; REC_T_ID=71a698d6-1571-11eb-9baf-48df3757c438; SPC_SI=mall.n58BgakbNjCD5RDYlsQJ8EurmBkH5HIY; SPC_CT_c49f0fdc="1603486254.GqWz1BPlfz3MKmUufL3eTwFqgUfdKWcWVf2xiJI7nSk="; SPC_R_T_ID="89vber/2TKnfACAmGbXpxC3BzHc0ajEQMPxgMbAlZnQlgEo7YWmya0sf/KRt1FsoZvaFYKoNDk+Rh9YWLWsNMH324iqgZePbam1q9QpYQlE="; SPC_T_IV="vko6vAtWsyHuqteFHAoPIA=="; SPC_R_T_IV="vko6vAtWsyHuqteFHAoPIA=="; SPC_T_ID="89vber/2TKnfACAmGbXpxC3BzHc0ajEQMPxgMbAlZnQlgEo7YWmya0sf/KRt1FsoZvaFYKoNDk+Rh9YWLWsNMH324iqgZePbam1q9QpYQlE="; AMP_TOKEN=%24NOT_FOUND; _ga=GA1.3.602723004.1603486255; _gid=GA1.3.657631736.1603486255; _dc_gtm_UA-61915055-6=1; language=en',
}
params = (
('by', 'sales'),
('limit', '50'),
('match_id', '21700'),
('newest', '0'),
('order', 'desc'),
('page_type', 'search'),
('version', '2'),
)
response = requests.get('https://shopee.com.my/api/v2/search_items/', headers=headers, params=params)
pprint(response.json())