Python web scraping yields different HTML than browser after log-in page

I'm currently doing web scraping for the first time, trying to grab and compile a list of completed katas from my Codewars profile. You can view the completed problems without being logged in, but your solutions are not displayed unless you are logged in to that specific account.
Here is an inspect preview of the page as displayed when logged in, showing the relevant divs I'm trying to scrape:
The URL for that page is https://www.codewars.com/users/User_Name/completed_solutions
with User_Name replaced by an actual username.
The log-in page is: https://www.codewars.com/users/sign_in
I have attempted to get the divs with the class "list-item solutions" in two different ways, shown below:
# attempt 1
import requests
from bs4 import BeautifulSoup

login_url = "https://www.codewars.com/users/sign_in"
end_url = "https://www.codewars.com/users/Ash-Ozen/completed_solutions"

with requests.Session() as sesh:
    result = sesh.get(login_url)
    soup = BeautifulSoup(result.content, "html.parser")
    # Grab the CSRF token from the sign-in form.
    token = soup.find("input", {"name": "authenticity_token"})["value"]
    payload = {
        "user[email]": "ph@gmail.com",
        "user[password]": "phpass",
        "authenticity_token": str(token),
    }
    result = sesh.post(login_url, data=payload)  # this logs me in?
    page = sesh.get(end_url)  # this navigates me to the target page?
    soup = BeautifulSoup(page.content, "html.parser")
    print(soup.prettify())  # some debugging
    # Examining the printed markup shows that "list-item solutions" is not there.
    # Checking page.url shows the correct URL
    # (https://www.codewars.com/users/Ash-Ozen/completed_solutions).
    solutions = soup.findAll("div", class_="list-item solutions")
    # solutions yields an empty list.
and
# attempt 2
from robobrowser import RoboBrowser

browser = RoboBrowser(history=True)
browser.open("https://www.codewars.com/users/sign_in")
form = browser.get_form()
form["user[email]"].value = "phmail@gmail.com"
form["user[password]"].value = "phpass"
browser.submit_form(form)  # I think RoboBrowser handles the CSRF token for me?
browser.open("https://www.codewars.com/users/Ash-Ozen/completed_solutions")
soup = browser.parsed  # the current page, already parsed into a BeautifulSoup object
solutions = soup.find_all("div", class_="list-item solutions")
print(solutions)  # returns an empty list
I have no idea what to debug from here to get it working.
Edit: My initial thought about what is going wrong is that, after performing either POST, I get redirected to the dashboard (the behavior after logging in successfully), but when I then request the final URL I seem to end up with the non-logged-in version of the page.
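One way to narrow this down is to check whether the session is actually authenticated after the POST, for example by looking for something that only appears when logged in. A minimal debugging sketch, assuming the requests-based attempt above; the "sign_out" marker is an assumption and may need adjusting to whatever the logged-in page actually contains:

# Hedged debugging sketch: verify the login POST before scraping the target page.
import requests
from bs4 import BeautifulSoup

login_url = "https://www.codewars.com/users/sign_in"
end_url = "https://www.codewars.com/users/Ash-Ozen/completed_solutions"

with requests.Session() as sesh:
    soup = BeautifulSoup(sesh.get(login_url).content, "html.parser")
    token = soup.find("input", {"name": "authenticity_token"})["value"]
    payload = {
        "user[email]": "ph@gmail.com",  # placeholder credentials
        "user[password]": "phpass",
        "authenticity_token": token,
    }
    result = sesh.post(login_url, data=payload)
    print(result.status_code, result.url)  # did the POST redirect to the dashboard?

    page = sesh.get(end_url)
    # Assumption: "sign_out" only appears in the markup when authenticated.
    print("logged in?", "sign_out" in page.text)
    # If the login worked but the divs are still missing, the solutions list is
    # probably rendered by JavaScript after the page loads, which requests alone
    # will never see.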

Related

Python Requests Post login

Yes, I know I'm green. I'm trying to learn how to POST into websites, but I can't seem to pick the right fields to pass into the POST request.
Below you'll see the HTML for the site that I'm trying to grab everything from:
HTML picture
I've tried the following code to log into the Greetly website but have been having a hell of a time. I'm sure the values I'm passing must have the wrong keys, but I can't figure out what I'm doing wrong.
import requests
from bs4 import BeautifulSoup

url = 'https://app.greetly.com'
urlVisitorLog = 'https://app.greetly.com/locations/00001/check_in_records'
values = {
    'user[email]': 'email',
    'user[password]': 'password'
}

c = requests.Session()
results = c.get(url)
soup = BeautifulSoup(results.content, 'html.parser')
key = soup.find(name="authenticity_token")
authenticity_token = key['value']
values["authenticity_token"] = authenticity_token

c.post(urlVisitorLog, headers=values)
r = c.get(urlVisitorLog)
soup2 = BeautifulSoup(r.content, 'html.parser')
Also, once past the username and password, I noticed the authenticity token isn't bound to a specific id, but I kind of need that login to work in order to parse through and see where it is.
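Two things in the snippet look suspicious and are worth double-checking: the login payload is sent as headers= rather than data=, and soup.find(name="authenticity_token") searches for a tag called authenticity_token instead of an input whose name attribute is authenticity_token. A hedged sketch of how the login step might look instead; the sign-in URL is an assumption based on the Rails-style user[email]/user[password] keys above:

import requests
from bs4 import BeautifulSoup

login_url = 'https://app.greetly.com/users/sign_in'  # assumption: the actual sign-in form URL
urlVisitorLog = 'https://app.greetly.com/locations/00001/check_in_records'

with requests.Session() as c:
    # Fetch the login page and pull the CSRF token out of the hidden input.
    soup = BeautifulSoup(c.get(login_url).content, 'html.parser')
    token = soup.find('input', {'name': 'authenticity_token'})['value']

    values = {
        'user[email]': 'email',        # placeholder credentials
        'user[password]': 'password',
        'authenticity_token': token,
    }
    # Send the credentials as form data (data=), not as HTTP headers.
    c.post(login_url, data=values)

    r = c.get(urlVisitorLog)
    soup2 = BeautifulSoup(r.content, 'html.parser')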

Scraping Bandcamp fan collections via POST

I've been trying to scrape Bandcamp fan pages to get a list of the albums they have purchased, and I'm having trouble doing it efficiently. I wrote something with Selenium, but it's mildly slow, so I'd like to learn a solution that would maybe send a POST request to the site and parse the JSON from there.
Here's a sample collection page: https://bandcamp.com/nhoward
Here's the Selenium code:
def scrapeFanCollection(url):
    browser = getBrowser()
    setattr(threadLocal, 'browser', browser)
    # Go to the url
    browser.get(url)
    try:
        # Click the "show more" button
        browser.find_element_by_class_name('show-more').click()
        # Wait two seconds
        time.sleep(2)
        # Scroll to the bottom, loading the full collection
        scroll(browser, 2)
    except Exception:
        pass
    # Return the full album collection
    soup_a = BeautifulSoup(browser.page_source, 'lxml', parse_only=SoupStrainer('a', {"class": "item-link"}))
    # Loop through all the <a> elements in the page source
    urls = []
    for item in soup_a.find_all('a', {"class": "item-link"}):
        url = item.get('href')
        if url is not None:
            urls.append(url)
    return urls
The API can be accessed as follows:
$ curl -X POST -H "Content-Type: Application/JSON" -d \
'{"fan_id":82985,"older_than_token":"1586531374:1498564527:a::","count":10000}' \
https://bandcamp.com/api/fancollection/1/collection_items
I didn't encounter a scenario where an "older_than_token" was stale, so the problem boils down to getting the "fan_id" given a URL.
This information is located in a blob in the id="pagedata" element.
>>> import json
>>> import requests
>>> from bs4 import BeautifulSoup
>>> res = requests.get("https://www.bandcamp.com/ggorlen")
>>> soup = BeautifulSoup(res.text, "lxml")
>>> user = json.loads(soup.find(id="pagedata")["data-blob"])
>>> user["fan_data"]["fan_id"]
82985
Putting it all together (building upon this answer):
import json
import requests
from bs4 import BeautifulSoup

fan_page_url = "https://www.bandcamp.com/ggorlen"
collection_items_url = "https://bandcamp.com/api/fancollection/1/collection_items"

res = requests.get(fan_page_url)
soup = BeautifulSoup(res.text, "lxml")
user = json.loads(soup.find(id="pagedata")["data-blob"])

data = {
    "fan_id": user["fan_data"]["fan_id"],
    "older_than_token": user["wishlist_data"]["last_token"],
    "count": 10000,
}
res = requests.post(collection_items_url, json=data)
collection = res.json()

for item in collection["items"][:10]:
    print(item["album_title"], item["item_url"])
I'm using user["wishlist_data"]["last_token"] which has the same format as the "older_than_token" just in case this matters.
In order to get the entire collection i changed the previous code from
"older_than_token": user["wishlist_data"]["last_token"]
to
user["collection_data"]["last_token"]
which contained the right token
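A minimal sketch of that variant, reusing the user blob and collection_items_url from the code above; only the source of the token changes:

# Same request as before, but take the token from collection_data instead of wishlist_data.
data = {
    "fan_id": user["fan_data"]["fan_id"],
    "older_than_token": user["collection_data"]["last_token"],
    "count": 10000,
}
res = requests.post(collection_items_url, json=data)
collection = res.json()
print(len(collection["items"]))  # should now cover the entire collection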
Unfortunately for you, this particular Bandcamp site doesn't seem to make any HTTP API call to fetch the list of albums. You can check that with your browser's developer tools: open the Network tab and click the XHR filter. The only call being made seems to be the one fetching your collection details.

Sending Requests in Python Returns None When Trying to Scrape a Specific Web page

import time
import requests
from bs4 import BeautifulSoup

# Assumption: the original headers dict was defined elsewhere; a browser-like User-Agent is used here.
headers = {'User-Agent': 'Mozilla/5.0'}

shoe = input('Shoe name: ')
URL = 'https://stockx.com/search?s=' + shoe
page = requests.get(URL, headers=headers)
soup = BeautifulSoup(page.content, 'html.parser')
time.sleep(2)  # this was to ensure the webpage had enough time to load so that it wouldn't scrape a prematurely loaded page
test = soup.find(class_='BrowseSearchDescription__SearchConfirmation-sc-1mt8qyd-1 dcjzxm')
print(test)  # returns None
print(URL)  # prints the URL (which is the correct URL of the website I'm attempting to scrape)
I understand that I could easily do this with Selenium; however, it is very inefficient, as it loads up a Chrome tab and navigates to the web page. I'm trying to make this efficient, and my original "prototype" did use Selenium, but it was always detected as a bot and my whole code was stopped by captchas. Am I doing something wrong that is causing the code to return None, or is that specific webpage unscrapeable? If you need it, the specific URL is https://stockx.com/search?s=yeezy
I tried your code and here is the result.
Code
import requests
import bs4 as bs

shoe = 'yeezy'
URL = 'https://stockx.com/search?s=' + shoe
page = requests.get(URL)
soup = bs.BeautifulSoup(page.content, 'html.parser')
And when I see what's inside the soup, here is the result.
Result
..
..
<div id="px-captcha">
</div>
<p> Access to this page has been denied because
we believe you are using automation tools to browse the website.</p>
..
..
Yes, I guess the developers didn't want the website to be scraped.
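If you want to detect this case in code rather than by reading the soup, a small hedged check against the blocked response shown above might look like this (the px-captcha id is taken from that output and could change):

# Hedged sketch: detect the captcha/block page shown above instead of silently getting None.
import requests
from bs4 import BeautifulSoup

page = requests.get('https://stockx.com/search?s=yeezy')
soup = BeautifulSoup(page.content, 'html.parser')

if soup.find(id='px-captcha') is not None:
    print('Blocked: the site returned a captcha page instead of search results.')
else:
    print('Got a real page; proceed with scraping.')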

how to pull the shipping price from banggood.com using beautifulsoup

I'm trying to get the shipping price from this link:
https://www.banggood.com/Xiaomi-Mi-Air-Laptop-2019-13_3-inch-Intel-Core-i7-8550U-8GB-RAM-512GB-PCle-SSD-Win-10-NVIDIA-GeForce-MX250-Fingerprint-Sensor-Notebook-p-1535887.html?rmmds=search&cur_warehouse=CN
but it seems that the "strong" is empty.
I've tried a few solutions, but all of them gave me an empty "strong".
I'm using BeautifulSoup in Python 3.
For example, this code led me to an empty "strong":
import requests
from bs4 import BeautifulSoup

url = 'https://www.banggood.com/Xiaomi-Mi-Air-Laptop-2019-13_3-inch-Intel-Core-i7-8550U-8GB-RAM-512GB-PCle-SSD-Win-10-NVIDIA-GeForce-MX250-Fingerprint-Sensor-Notebook-p-1535887.html?rmmds=search&cur_warehouse=CN'  # the product page linked above
client = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
soup = BeautifulSoup(client.content, 'lxml')
for child in soup.find("span", class_="free_ship").children:
    print(child)
The issue is that the 'Free Shipping' text is generated by JavaScript after the page loads, rather than being sent in the webpage itself.
It might obtain the shipping price by performing an HTTP request after the page has loaded, or the data may be hidden within the page.
You might be able to find the XHR request that pulls the shipping price by using DevTools in Firefox or Chrome (the Network tab) and use that to get the price.
Using the XHR, you can find that data:
import requests

url = 'https://m.banggood.com/ajax/product/dynamicPro/index.html'
payload = {
    'c': 'api',
    'sq': 'IY38TmCNgDhATYCmIDGxYisATHA7ANn2HwX2RNwEYrcAGAVgDNxawIQFhLpFhkOCuZFFxA'
}

response = requests.get(url, params=payload).json()
data = response['result']
shipping = data['shipment']
for each in shipping.items():
    print(each)

print(shipping['shipCost'])
Output:
print (shipping['shipCost'])
<b>Free Shipping</b>
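Since shipCost comes back as an HTML fragment (<b>Free Shipping</b> above), you may want to strip the markup before using the value; a small hedged sketch, reusing the shipping dict from the code above:

# Hedged sketch: strip the HTML tags from the shipCost fragment shown above.
from bs4 import BeautifulSoup

ship_cost_html = shipping['shipCost']  # e.g. "<b>Free Shipping</b>"
ship_cost_text = BeautifulSoup(ship_cost_html, 'lxml').get_text(strip=True)
print(ship_cost_text)  # "Free Shipping"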

Login to aspx website using python requests

I'm trying to log into my school website, which uses ASPX, with requests in order to scrape some data. My problem is similar to the one described here:
Log in to ASP website using Python's Requests module
However, my form also requires SubmitButton.x and SubmitButton.y, and I don't know where to get them from. I tried passing in the values that worked in a manual login, but it didn't work.
Here's the page
form data from successful manual login
from bs4 import BeautifulSoup
import requests

login_url = 'https://adfslight.resman.pl/LoginPage.aspx?ReturnUrl=%2f%3fwa%3dwsignin1.0%26wtrealm%3dhttps%253a%252f%252fcufs.resman.pl%253a443%252frzeszow%252fAccount%252fLogOn%26wctx%3drm%253d0%2526id%253dADFS%2526ru%253d%25252frzeszow%25252fFS%25252fLS%25253fwa%25253dwsignin1.0%252526wtrealm%25253dhttps%2525253a%2525252f%2525252fuonetplus.resman.pl%2525252frzeszow%2525252fLoginEndpoint.aspx%252526wctx%25253dhttps%2525253a%2525252f%2525252fuonetplus.resman.pl%2525252frzeszow%2525252fLoginEndpoint.aspx%26wct%3d2018-02-04T18%253a08%253a18Z&wa=wsignin1.0&wtrealm=https%3a%2f%2fcufs.resman.pl%3a443%2frzeszow%2fAccount%2fLogOn&wctx=rm%3d0%26id%3dADFS%26ru%3d%252frzeszow%252fFS%252fLS%253fwa%253dwsignin1.0%2526wtrealm%253dhttps%25253a%25252f%25252fuonetplus.resman.pl%25252frzeszow%25252fLoginEndpoint.aspx%2526wctx%253dhttps%25253a%25252f%25252fuonetplus.resman.pl%25252frzeszow%25252fLoginEndpoint.aspx&wct=2018-02-04T18%3a08%3a18Z'

data = {}
with requests.Session() as s:
    page = s.get(login_url).content
    soup = BeautifulSoup(page, "lxml")
    data["__EVENTTARGET"] = ""
    data["__EVENTARGUMENT"] = ""
    data["__VIEWSTATE"] = soup.select_one("#__VIEWSTATE")["value"]
    data["__VIEWSTATEGENERATOR"] = soup.select_one("#__VIEWSTATEGENERATOR")["value"]
    data["__EVENTVALIDATION"] = soup.select_one("#__EVENTVALIDATION")["value"]
    data["UsernameTextBox"] = "myusername"
    data["PasswordTextBox"] = "mypassword"
    data["SubmitButton.x"] = "49"
    data["SubmitButton.y"] = "1"
    s.post(login_url, data=data)
    open_page = s.get("https://uonetplus.resman.pl/rzeszow/Start.mvc/Index")
    print(open_page.text)
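Rather than hard-coding each hidden field, one option is to copy every input from the login form into the payload and then overwrite the username, password, and submit-button coordinates. A hedged sketch, reusing login_url from the snippet above; the field names (UsernameTextBox, PasswordTextBox, SubmitButton) are taken from the question, and the exact coordinate values are assumed not to matter:

# Hedged sketch: build the POST payload from whatever inputs the login form actually contains.
from bs4 import BeautifulSoup
import requests

with requests.Session() as s:
    soup = BeautifulSoup(s.get(login_url).content, "lxml")
    form = soup.find("form")

    # Start with whatever the form already carries (__VIEWSTATE, __EVENTVALIDATION, ...).
    data = {
        inp.get("name"): inp.get("value", "")
        for inp in form.find_all("input")
        if inp.get("name")
    }

    # Overwrite the user-supplied fields.
    data["UsernameTextBox"] = "myusername"
    data["PasswordTextBox"] = "mypassword"
    # Image-button coordinates: servers typically only check that they are present.
    data["SubmitButton.x"] = "0"
    data["SubmitButton.y"] = "0"

    s.post(login_url, data=data)
    open_page = s.get("https://uonetplus.resman.pl/rzeszow/Start.mvc/Index")
    print(open_page.text)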
