Before I start, let me point out that I have almost no clue what I'm doing. Imagine a cat trying to write code. I'm writing some Python in PyCharm on Ubuntu 22.04.1 LTS, and I also used Insomnia, if that makes any difference. Here is the code:
```python
# sad_scrape_code_attempt.py
import time

import httpx
from playwright.sync_api import sync_playwright

HEADERS = {
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:106.0) Gecko/20100101 Firefox/106.0",
    "Accept": "*/*",
    "Accept-Language": "en-US,en;q=0.5",
    "Accept-Encoding": "gzip, deflate, br",
    "Referer": "https://shop.metro.bg/shop/cart",
    "CallTreeId": "||BTOC-1BF47A0C-CCDD-47BB-A9DA-592009B5FB38",
    "Content-Type": "application/json; charset=UTF-8",
    "x-timeout-ms": "5000",
    "DNT": "1",
    "Connection": "keep-alive",
    "Sec-Fetch-Dest": "empty",
    "Sec-Fetch-Mode": "cors",
    "Sec-Fetch-Site": "same-origin"
}


def get_cookie_playwright():
    with sync_playwright() as p:
        browser = p.firefox.launch(headless=False, slow_mo=50)
        context = browser.new_context()
        page = context.new_page()

        # Log in and accept the consent dialog
        page.goto('https://shop.metro.bg/shop/cart')
        page.fill('input#user_id', 'the_sad_cat_username')
        page.fill('input#password', 'the_sad_cat_password')
        page.click('button[type=submit]')
        page.click('button.btn-primary.accept-btn.field-accept-button-name')

        # Keep scrolling to the bottom until the page height stops changing
        page.evaluate(
            """
            var intervalID = setInterval(function () {
                var scrollingElement = (document.scrollingElement || document.body);
                scrollingElement.scrollTop = scrollingElement.scrollHeight;
            }, 200);
            """
        )
        prev_height = None
        while True:
            curr_height = page.evaluate('(window.innerHeight + window.scrollY)')
            if not prev_height:
                prev_height = curr_height
                time.sleep(1)
            elif prev_height == curr_height:
                page.evaluate('clearInterval(intervalID)')
                break
            else:
                prev_height = curr_height
                time.sleep(1)

        # print(context.cookies())
        # NOTE: assumes the BIGipServer cookie is always at index 11
        cookie_for_requests = context.cookies()[11]['value']
        browser.close()
        return cookie_for_requests


def req_with_cookie(cookie_for_requests):
    cookies = dict(
        Cookie=f'BIGipServerbetty.metrosystems.net-80={cookie_for_requests};')
    r = httpx.get('https://shop.metro.bg/ordercapture.customercart.v1/carts/alias/current', cookies=cookies)
    return r.text


if __name__ == '__main__':
    data = req_with_cookie(get_cookie_playwright())
    print(data)

# Used packages:
# Playwright
# pytest
# pytest-playwright
# JavaScript
# TypeScript
# httpx
```
So basically I copy-pasted the code from two tutorials by John Watson Rooney:

- The Biggest Mistake Beginners Make When Web Scraping
- Login and Scrape Data with Playwright and Python

Then I combined them and added some JavaScript to scroll to the bottom of the page. Then I found an article called "How Headers Are Used to Block Web Scrapers and How to Fix It", so I replaced `import requests` with `import httpx` and added the HEADERS as generated by Insomnia. From what I understand, browsers send their headers in a particular order, and this is an often overlooked way of identifying web scrapers, mainly because many HTTP clients in various languages apply their own header ordering, which makes scrapers easy to spot. If this is true, I need to figure out a way to send my Cookie header in the correct position, which I have no clue how to work out, but I believe it's #11 or #3, judging by the code generated by Insomnia:
```python
import requests

url = "https://shop.metro.bg/ordercapture.customercart.v1/carts/alias/current"

querystring = {"customerId": "1001100022726355", "cardholderNumber": "1", "storeId": "00022", "country": "BG", "locale": "bg-BG", "fsdAddressId": "1001100022726355994-AD0532EI", "__t": "1668082324830"}

headers = {
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:106.0) Gecko/20100101 Firefox/106.0",
    "Accept": "*/*",
    "Accept-Language": "en-US,en;q=0.5",
    "Accept-Encoding": "gzip, deflate, br",
    "Referer": "https://shop.metro.bg/shop/cart",
    "CallTreeId": "||BTOC-1BF47A0C-CCDD-47BB-A9DA-592009B5FB38",
    "Content-Type": "application/json; charset=UTF-8",
    "x-timeout-ms": "5000",
    "DNT": "1",
    "Connection": "keep-alive",
    "Cookie": "selectedLocale_BG=bg-BG; BIGipServerbetty.metrosystems.net-80=!DHrH53oKfz3YHEsEdKzHuTxiWd+ak6uA3C+dv7oHRDuEk+ScE0MCf7DPAzLTCmE+GApsIOFM2GKufYk=; anonymousUserId=24EE2F84-55B5-4F94-861E-33C4EB770DC6; idamUserIdToken=eyJhbGciOiJSUzI1NiIsInR5cCI6IkpXVCIsImtpZCI6IktfYWE1NTAxNWEtMjA2YS0xMWVkLTk4ZDUtZTJjYzEyYjBkYzUwIn0.eyJleHAiOjE2NjgwODQxMjIsImlhdCI6MTY2ODA4MjMyMiwiYXVkIjoiQlRFWCIsInNlc3Npb25fc3RhdGUiOiJPTnJweFVhOG12WHRJeDR0c3pIZ09GR296WHUyeHZVVzVvNnc3eW1lLUdZLnJMRU1EWGFGIiwiaXNzIjoiaHR0cHM6Ly9pZGFtLm1ldHJvLmJnIiwiZW1haWwiOiJvZmZpY2VAdGVydmlvbi5iZyIsIm5vbmNlIjoiZjg3ZDMyYzEyYTRkNDY1ZGEzYjQwMTQ3OTlkYzc4NzMiLCJjX2hhc2giOiIiLCJzdWIiOiJVX2Y0MjBhY2E4LWY2OTMtNGMxNS1iOTIzLTc1NWY5NTc3ZTIwMCIsImF0X2hhc2giOiJlbkFGRFNJdUdmV0wzNnZ0UnJEQ253IiwicmVhbG0iOiJTU09fQ1VTVF9CRyIsImF1dGhfdGltZSI6MTY2ODA4MjMyMiwiYW1yIjpbIlVTRVJfQ1JFREVOVElBTFMiXX0.AC9vccz5PBe0d2uD6tHV5KdQ8_zbZvdARGUqo5s8KpJ0bGw97vm3xadF5TTHBUwkXX3oyJsbygC1tKvQInycU-zE0sqycIDtjP_hAGf6tUG-VV5xvtRsxBkacTBMy8OmbNHi5oncko7-dZ_tSOzQwSclLZKgKaqBcCqPBQVF0ug4pvbbqyZcw6D-MH6_T5prF7ppyqY11w9Ps_c7pFCciFR965gsO3Q-zr8CjKq1qGJeEpBFMKF0vfwinrc4wDpC5zd0Vgyf4ophzo6JkzA8TiWOGou5Z0khIpl435qUzxzt-WPFwPsPefhg_X9fYHma_OqQIpNjnV2tQwHqBD1qMTGXijtfOFQ; USER_TYPE=CUST; compressedJWT=eNpVUtlyozAQ/CJvcdrhEZtLGGGbQ4BeUlwGiTMhMeCvX5Fkt3YfVKrqme6eaalc7Tozc3Ihth8+Ae8SW/lVrvaYi3AD7Vx0eRwD4pzsuohuG3YsIm/EkUyxDybQjVzqgz2gqnBrj0ZpthNEtzUNqjWl3uqb4xkxA8Z/FhHY+ATHHld+cdFnYbZcZqIPpsflK9PpsBbw4LvfVFYcsh6LzdLJfGbOE+hR8B9ObOmG4FTqLgz4InCs+hhw81Q0BnQsHIQGmBLe3TR/7nzC7fHqmBh6uuIDMpMCuVwm2u2Xf2NbngbWDc9NQ85MpcYnhvcfOejtB5s1B3TMQefyueg9sgit8QlM8cnmc1P+rlF9hpq+QE2dIQUipMnTDRiPLBuvzjtvyISlwbF9KSKe5WH/8Izvnt5rE6FGuYDWsFMmjOa/+zMfLmWegYkEHC0/PO+P9qPYcuzbb5ztwvqVr1061LHzTHX8yDu33XbCnTHlQsgydcesK5iPO2JBvmbk3xpmH6RtNt00YnNQXXBpNV+0UIYU8lCD2ztKOdODQSJcNFVyg2aF60zS2GVvjvQk9lpAh8WliQS1aoVPwPJQn/fbr0vdxRiDJLh7d8pJhzVeNIW+75QK7H0zFVp9Z3BeGmZlA17s5LAcHDgjmc8vO/QiqorcSOenYVEx0/HJATQIqDJxAS7qsKnGQqrrXf5qNaf9GyRl3emruki8vxg0It5IhsxSfI8lGkvl+72qsoNMjhUp75xzR7NRq83w0Pp6oRqg74eq65zPaD/H9TX6GIyDfmFccfA8/fVtkPe7y5AUosA+fpZWBO0l9QzSZIfuoeG2n8aJNKG0WMfoap2XOcVJKT0ex9ep0m9vZv0gJwkqKue+Xb0TZ0Bjz+HMqi9W6Z81h+8PCaRZTJtoFYOun46FkQiPyFmGF65/VX33RdKl+ZYcXDvs7/Nv6PdLkg==; SES2_customerAdr_1001100022726355={%22addressId%22:%221001100022726355994-AD0532EI%22%2C%22addressHash%22:%221001100022726355994-AD0532EI%22%2C%22storeId%22:%2200022%22}; SES2_customerAdr_={%22addressId%22:null%2C%22addressHash%22:null%2C%22storeId%22:%2200022%22}; UserSettings=SelectedStore=1b1fc6ac-2ad6-4243-806e-a4a28c96dff4&SelectedAddress=1001100022726355994-ad0532ei",
    "Sec-Fetch-Dest": "empty",
    "Sec-Fetch-Mode": "cors",
    "Sec-Fetch-Site": "same-origin"
}

response = requests.request("GET", url, headers=headers, params=querystring)

print(response.text)
```
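The closest I've got to controlling the header order myself is something like the sketch below. I haven't confirmed that httpx actually keeps this order on the wire, or that Metro even checks it, and the cookie value is a placeholder to be filled in from Playwright:

```python
# Rough sketch only: hand httpx the full header list in the exact order the
# browser sends it, so the client's own defaults don't get slotted in elsewhere.
import httpx

ordered_headers = httpx.Headers([
    ("User-Agent", "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:106.0) Gecko/20100101 Firefox/106.0"),
    ("Accept", "*/*"),
    ("Accept-Language", "en-US,en;q=0.5"),
    ("Accept-Encoding", "gzip, deflate, br"),
    ("Referer", "https://shop.metro.bg/shop/cart"),
    ("Cookie", "BIGipServerbetty.metrosystems.net-80=<value from Playwright>"),  # placeholder
    ("Connection", "keep-alive"),
    # ...plus the remaining browser headers, in the same order the browser sends them
])

r = httpx.get(
    "https://shop.metro.bg/ordercapture.customercart.v1/carts/alias/current",
    headers=ordered_headers,
)
print(r.status_code)
```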
So I'm stuck. Any help or ideas will be greatly appreciated.
The page you're navigating to shows this on a GET request:
HTTP ERROR 401 You must provide a http header 'JWT'
This means that this page requires a level of authorization to be accessed.
See JWTs.
"Authorization: This is the most common scenario for using JWT. Once the user is logged in, each subsequent request will include the JWT, allowing the user to access routes, services, and resources that are permitted with that token."
You can access the root page just fine, but once you navigate to more user specific pages or "routes", you will need to provide a JWT to access that page's content.
There is a way to get past this when scraping: log in to the site as a user with your scraper, collect the JWT that the server creates and sends back to your client, and then use that JWT in your request's headers:
```python
token = "randomjwtgibberishshahdahdahwdwa"

HEADERS = {
    "Authorization": "Bearer " + token
}
```
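In your case, the Insomnia dump suggests the token lives in the `idamUserIdToken` cookie set after login, so one option (a sketch only, untested against the site) is to pull it out of the Playwright context and attach it to the request:

```python
# Sketch, untested: assumes the JWT is the 'idamUserIdToken' cookie seen in the
# Insomnia capture. The 401 message hints the server may want a header literally
# named 'JWT' rather than (or as well as) 'Authorization'.
def get_jwt(context):
    cookies = {c["name"]: c["value"] for c in context.cookies()}
    return cookies.get("idamUserIdToken")

# Then, when calling the cart endpoint:
# headers = dict(HEADERS, JWT=jwt)                        # if a literal 'JWT' header is required
# headers = dict(HEADERS, Authorization="Bearer " + jwt)  # if it expects a Bearer token
# r = httpx.get(url, headers=headers, cookies=cookies)
```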
So, basically, it seems that requests.get(url) never completes with a TikTok user profile URL:

```python
import requests

url = "http://tiktok.com/#malopedia"
rep = requests.get(url)  # <= will never complete
```
As I don't get any error message, I have no idea what's going on. Why is it not completing? How do I get it to complete?
TikTok is quite strict when it comes to automated connections, so you need to provide headers in your request, like this:
```python
import requests

url = "http://tiktok.com/#malopedia"
rep = requests.get(
    url,
    headers={
        "Accept": "*/*",
        "Accept-Encoding": "identity;q=1, *;q=0",
        "Accept-Language": "en-US;en;q=0.9",
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
        "Pragma": "no-cache",
        "User-Agent": "Mozilla/5.0",
    },
)
print(rep)
```
This should respond with 200.
However, if you plan on doing some heavy lifting with your code, consider using one of the unofficial API wrappers, like this one.
I am trying to scrape this website's items; however, when I use httpx (or even requests), sometimes the request passes and gets a response and sometimes it doesn't. It seems random, which is why I tried rerunning the failed items to get their results, but even that does not work 100% of the time. Is there something I am not doing right? Can somebody help?
Below is a sample of the URLs I am scraping; if it doesn't show any errors, you can loop it to scrape the same thing twice.
Here's my current code:
```python
import httpx
import pandas as pd
from fake_useragent import UserAgent  # assuming this is where UserAgent() comes from
from random import randint
from time import sleep

# random_ip, id, and today are defined elsewhere in my script

urllist = ['https://www.blibli.com/backend/product-detail/products/ps--RAM-70107-16912/_summary?defaultItemSku=RAM-70107-16912-00001&pickupPointCode=PP-3239816&cnc=false',
           'https://www.blibli.com/backend/product-detail/products/ps--RAM-70107-04124/_summary?defaultItemSku=RAM-70107-04124-00001&pickupPointCode=PP-3239816&cnc=false',
           'https://www.blibli.com/backend/product-detail/products/ps--RAM-70107-19598/_summary?defaultItemSku=RAM-70107-19598-00001&pickupPointCode=PP-3239816&cnc=false',
           'https://www.blibli.com/backend/product-detail/products/ps--RAM-70107-01115/_summary?defaultItemSku=RAM-70107-01115-00001&pickupPointCode=PP-3206709&cnc=false',
           'https://www.blibli.com/backend/product-detail/products/ps--RAM-70107-02620/_summary?defaultItemSku=RAM-70107-02620-00001&pickupPointCode=PP-3239816&cnc=false',
           'https://www.blibli.com/backend/product-detail/products/ps--RAM-70107-17552/_summary?defaultItemSku=RAM-70107-17552-00001&pickupPointCode=PP-3023611&cnc=false',
           'https://www.blibli.com/backend/product-detail/products/ps--RAM-70107-08377/_summary?defaultItemSku=RAM-70107-08377-00001&pickupPointCode=PP-3239816&cnc=false',
           'https://www.blibli.com/backend/product-detail/products/ps--RAM-70107-03274/_summary?defaultItemSku=RAM-70107-03274-00001&pickupPointCode=PP-3069769&cnc=false',
           'https://www.blibli.com/backend/product-detail/products/ps--RAM-70107-04345/_summary?defaultItemSku=RAM-70107-04345-00001&pickupPointCode=PP-3239816&cnc=false',
           'https://www.blibli.com/backend/product-detail/products/ps--RAM-70107-01030/_summary?defaultItemSku=RAM-70107-01030-00001&pickupPointCode=PP-3239816&cnc=false',
           'https://www.blibli.com/backend/product-detail/products/ps--RAM-70107-01534/_summary?defaultItemSku=RAM-70107-01534-00001&pickupPointCode=PP-3239816&cnc=false']

rancherrors = pd.DataFrame()
ranchdf = pd.DataFrame()

for url in urllist:
    try:
        ua = UserAgent()
        USER_AGENT = ua.random
        headers = {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "accept-encoding": "gzip, deflate, br",
            "accept-language": "id-ID,id;q=0.9,en-US;q=0.8,en;q=0.7",
            "cache-control": "max-age=0",
            "sec-ch-ua": '"Chromium";v="106", "Google Chrome";v="106", "Not;A=Brand";v="99"',
            "sec-ch-ua-platform": "Windows",
            "sec-fetch-dest": "document",
            "sec-fetch-mode": "navigate",
            "sec-fetch-site": "none",
            "X-Forwarded-For": f"{random_ip}",
            "user-agent": f"{USER_AGENT}",
            "referer": "https://www.blibli.com/"
        }
        response = httpx.get(url, headers=headers)
        try:
            price = str(response.json()['data']['price']['listed']).replace(".0", "")
            discount = str(response.json()['data']['price']['totalDiscount'])
        except:
            price = "0"
            discount = "0"
        try:
            unit = str(response.json()['data']['uniqueSellingPoint']).replace("• ", "")
        except:
            unit = ""
        dat = {
            'product_name': response.json()['data']['name'],
            'normal_price': price,
            'discount': discount,
            'competitor_id': response.json()['data']['ean'],
            'url': url,
            'unit': unit,
            'astro_id': id,
            'date_key': today,
            'web': 'ranch market'
        }
        dat = pd.DataFrame([dat])
        ranchdf = ranchdf.append(dat)
        sleep(randint(2, 5))
    except Exception as e:
        datum = {
            'id': id,
            'url': url
        }
        datum = pd.DataFrame([datum])
        rancherrors = rancherrors.append(datum)
        print(f'{url} error {e}')
```
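The rerun of the failed items that I mentioned boils down to retrying the same request, roughly like the sketch below (simplified, with an illustrative helper name rather than my exact code):

```python
import httpx
from time import sleep

def fetch_with_retries(url, headers, attempts=3):
    # Retry the same request a few times, pausing a bit longer after each failure,
    # and give up only once every attempt has failed.
    for attempt in range(attempts):
        try:
            response = httpx.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            return response
        except Exception as e:
            print(f"attempt {attempt + 1} for {url} failed: {e}")
            sleep(2 * (attempt + 1))
    return None
```

The main loop then calls fetch_with_retries(url, headers) instead of httpx.get directly and only records the URL in rancherrors when it returns None.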
When I run the code below in a Python script, I get <Response [403]>.
But when I send a GET request via Postman or open the address in my browser, I don't get an error; the response status is 200.
I've copied all the headers that show up in the network console of Google Chrome to see if the issue was tied to me not sending the proper headers, but the issue persists even after copying every single header.
The code that runs my API:

```python
import json
from typing import Optional

import requests
import uvicorn
from fastapi import FastAPI, Query
from fastapi.responses import JSONResponse

app = FastAPI()

@app.get("/test")
def pass_parameters(parameters: Optional[str] = Query(None)):
    file_name = "hey.txt"
    with open(file_name, 'w') as f:
        api_token = "shdsajkhdjkasjdkasjh3242341jh"
        parameters = {"phrase": parameters}
        print(parameters)
        response = requests.get("https://api.state.taxes",
                                headers={"Accept": "application/json", "Authorization": f"Bearer {api_token}"},
                                params=parameters)
        f.write(json.dumps(response.json(), indent=4))
        response = response.json()
        return JSONResponse(content=response, media_type="text/json")

if __name__ == '__main__':
    uvicorn.run(app, port=8080, host='127.0.0.1')
```
The code to get stuff from my API:

```python
import requests

url = "http://127.0.0.1:8080/test"

headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "en-US,en;q=0.9",
    "Connection": "keep-alive",
    "Cookie": "AMCV_1B3AA45570643167F000101%40AdobeOrg=-%7CMCIDTS%7C18944%7CMCMID%%7CMCOPTOUT-1636735667s%7CNONE%7CvVersion%7C5.1.1",
    "Host": "127.0.0.1:8080",
    "If-Modified-Since": "Sat, 19 Mar 2022 15:13:13 GMT",
    "If-None-Match": "e6ead9f56bc933ab83465",
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": "Windows",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "none",
    "Sec-Fetch-User": "?1",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.74 Safari/537.36"
}

response = requests.get(url, headers=headers)
print(response)  # --> returns <Response [403]>
```
But when I open my browser, the webpage opens up fine, with a 200 response in the network console of the developer tools as well as in Postman. I exported the Postman call to Python and got the same error. I saw several Stack Overflow posts where people were able to fix the issue by applying the correct headers and disabling the proxy in Postman; I've done both and the issue persists. Is there perhaps a header that I'm missing?
Making a call to localhost instead of my local IP address seemed to have fixed the issue. Like so:
http://localhost:8080
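In other words, only the URL in the client script changes; a minimal sketch of the adjusted call:

```python
url = "http://localhost:8080/test"

response = requests.get(url, headers=headers)
print(response)  # should now print <Response [200]>
```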
I'm trying to integrate the Yandex OCR translator tool into my code. With the help of Burp Suite, I found the request that is used to send the image, and I'm trying to emulate it with the following code:
```python
import requests
from requests_toolbelt import MultipartEncoder

# (<filename>, <file object>, <content type>, <per-part headers>)
files = {
    'file': ("blob", open("image_path", 'rb'), "image/jpeg")
}

burp0_url = "https://translate.yandex.net:443/ocr/v1.1/recognize?srv=tr-image&sid=9b58493f.5c781bd4.7215c0a0&lang=en%2Cru"
m = MultipartEncoder(files, boundary='-----------------------------7652580604126525371226493196')
burp0_headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:65.0) Gecko/20100101 Firefox/65.0", "Accept": "*/*", "Accept-Language": "en-US,en;q=0.5", "Accept-Encoding": "gzip, deflate", "Referer": "https://translate.yandex.com/", "Content-Type": "multipart/form-data; boundary=-----------------------------7652580604126525371226493196", "Origin": "https://translate.yandex.com", "DNT": "1", "Connection": "close"}

print(requests.post(burp0_url, headers=burp0_headers, files=m.to_string()).text)
```
though sadly it yields the following output:
{"error":"BadArgument","description":"Bad argument: file"}
Does anyone know how this could be solved?
Many thanks in advance!
You are passing the MultipartEncoder.to_string() result to the files parameter, so you are asking requests to multipart-encode a body that is already multipart-encoded. That's one time too many.
You don't need to replicate every byte here; just post the file, and perhaps set the User-Agent, Referer, and Origin:
```python
import requests

files = {
    'file': ("blob", open("image_path", 'rb'), "image/jpeg")
}
url = "https://translate.yandex.net:443/ocr/v1.1/recognize?srv=tr-image&sid=9b58493f.5c781bd4.7215c0a0&lang=en%2Cru"
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:65.0) Gecko/20100101 Firefox/65.0",
    "Referer": "https://translate.yandex.com/",
    "Origin": "https://translate.yandex.com",
}

response = requests.post(url, headers=headers, files=files)
print(response.status_code)
print(response.json())
```
The Connection header is best left to requests; it can decide when a connection should be kept alive just fine. The Accept* headers are there to tell the server what your client can handle, and requests sets those automatically too.
I get a 200 OK response with that code:
```
200
{'data': {'blocks': []}, 'status': 'success'}
```
However, if you don't set additional headers (remove the headers=headers argument), the request also works, so Yandex doesn't appear to be filtering for robots here.
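As an aside, you can see which defaults requests fills in for you by inspecting a fresh session's headers (the exact values depend on your requests version):

```python
import requests

# The default headers a requests Session sends when you don't override them.
print(requests.Session().headers)
# e.g. {'User-Agent': 'python-requests/2.x.x', 'Accept-Encoding': 'gzip, deflate',
#       'Accept': '*/*', 'Connection': 'keep-alive'}
```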
I'm attempting to post the data of a pop-out form to a local website. To do this I'm emulating the request headers, data, and cookie information provided by the site. (Note: I've redacted my email and password from the code, for obvious reasons, but all other code remains the same.)
I have tried multiple permutations of the cookie, headers, request data, etc. I have also verified the cookie and the expected headers and data in a network inspector, and I can set a cookie using requests' sample code without trouble. I cannot explain why my code won't work against the live site, and I'd be very grateful for any assistance. Please see the following code for details.
```python
import json

import requests
import robobrowser

br = robobrowser.RoboBrowser(user_agent="Windows Chrome", history=True)

url = "http://posting.cityweekly.net/gyrobase/API/Login/CookieV2"

data = {
    "passwordChallengeResponse": "....._SYGwbDLkSyU5gYKGg",
    "email": "<email>%40bu.edu",
    "ttl": "129600",
    "sessionOnly": "1"
}

headers = {
    "Origin": "http://posting.cityweekly.net",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "en-US,en;q=0.8,ru;q=0.6",
    "User-Agent": "Windows Chrome",  # "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.65 Safari/537.36",
    "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Referer": "http://posting.cityweekly.net/utah/Events/AddEvent",
    "X-Requested-With": "XMLHttpRequest",
    "Connection": "keep-alive",
    "Cache-Control": "max-age=0",
    "Host": "posting.cityweekly.net"
}

cookie = {"Cookie": "__utma=25975215.1299783561.1416894918.1416894918.1416897574.2; __utmc=25975215; __utmz=25975215.1416894918.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __qca=P0-2083194243-1416894918675; __gads=ID=e3b24038c9228b00:T=1416894918:S=ALNI_MY7ewizuxK0oISnqPJWlLDAeKFMmw; _cb_ls=1; _chartbeat2=D6vh2H_ZbNJDycc-t.1416894962025.1416897589974.1; __utmb=25975215.3.10.1416897574; __utmt=1"}

r = br.session.get(url, data=json.dumps(data), cookies=cookie, headers=headers)
print(r.headers)
print([item for item in r.cookies.__dict__.items()])
```
Note that I print the cookies object and that the cookies attribute (a dictionary) is empty.
You need to perform a POST to log in to the site. Once you do that, I believe the cookies will then have the correct values (not 100% sure on that...). This post clarifies how to properly set cookies.
Note: I don't think you need the additional import of requests unless you're using it outside of RoboBrowser.
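In code, the minimal change would look something like this (a sketch, assuming the login endpoint accepts the form-encoded fields from the data dict above):

```python
# Sketch: send the login as a POST with form-encoded data (the Content-Type header
# above is application/x-www-form-urlencoded), then inspect the session's cookie jar,
# which RoboBrowser keeps across requests.
r = br.session.post(url, data=data, cookies=cookie, headers=headers)
print(r.status_code)
print(br.session.cookies.get_dict())  # should now contain the login cookies
```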