Parsing a site in Python using BeautifulSoup

I'm having a problem parsing a site for a client. I want to parse the https://okchanger.com/exchangers URL. I have tried POST requests using headers and a payload consisting of some form data for the site. When I inspected the site in the Network tab, there was a POST request to data-table and a GET for the page itself. I would like to get the names and URLs, but the HTML source doesn't seem to contain them (when I parse the HTML and look for the elements, I get an empty list). Can you kindly tell me how you would approach this? Thanks in advance.
import re
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup as bs


class okchangerScraper:
    def __init__(self, URL):
        self.URL = URL
        self.headers = {
            'accept': 'application/json, text/javascript, */*; q=0.01',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'en-US,en;q=0.9',
            'content-length': '1835',
            'content-type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'cookie': '__RequestVerificationToken=N5w7MfY6iyx6ExDA6a7kFlKD6rSeYuYE-ExXkw_hOAIK5TpeSb6YUgSPMWWEypMzYNjVELCxA41W7XE0oTJtlLa4TJNIMmsvya8CTCHRkxM1',
            'origin': 'https://www.okchanger.com',
            'referer': 'https://www.okchanger.com/exchangers',
            'sec-fetch-dest': 'empty',
            'sec-fetch-mode': 'cors',
            'sec-fetch-site': 'same-origin',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36',
            'x-requested-with': 'XMLHttpRequest'
        }

    def scrape_this_page(self, page):
        with requests.session() as s:
            s.headers = self.headers
            payload = {
                'draw': '2',
                'columns[0][data]': 'Logo',
                'columns[0][name]': None,
                'columns[0][searchable]': 'true',
                'columns[0][orderable]': 'false',
                'columns[0][search][value]': None,
                'columns[0][search][regex]': 'false',
                'columns[1][data]': 'Name',
                'columns[1][name]': None,
                'columns[1][searchable]': 'true',
                'columns[1][orderable]': 'true',
                'columns[1][search][value]': None,
                'columns[1][search][regex]': 'false',
                'columns[2][data]': 'ReserveUSD',
                'columns[2][name]': None,
                'columns[2][searchable]': 'true',
                'columns[2][orderable]': 'true',
                'columns[2][search][value]': None,
                'columns[2][search][regex]': 'false',
                'columns[3][data]': 'Rates',
                'columns[3][name]': None,
                'columns[3][searchable]': 'true',
                'columns[3][orderable]': 'true',
                'columns[3][search][value]': None,
                'columns[3][search][regex]': 'false',
                'columns[4][data]': 'AlexaRank',
                'columns[4][name]': None,
                'columns[4][searchable]': 'true',
                'columns[4][orderable]': 'false',
                'columns[4][search][value]': None,
                'columns[4][search][regex]': 'false',
                'columns[5][data]': 'Popularity',
                'columns[5][name]': None,
                'columns[5][searchable]': 'true',
                'columns[5][orderable]': 'true',
                'columns[5][search][value]': None,
                'columns[5][search][regex]': 'false',
                'columns[6][data]': 'Status',
                'columns[6][name]': None,
                'columns[6][searchable]': 'true',
                'columns[6][orderable]': 'true',
                'columns[6][search][value]': None,
                'columns[6][search][regex]': 'false',
                'columns[7][data]': 'PositiveReviews',
                'columns[7][name]': None,
                'columns[7][searchable]': 'true',
                'columns[7][orderable]': 'true',
                'columns[7][search][value]': None,
                'columns[7][search][regex]': 'false',
                'order[0][column]': '5',
                'order[0][dir]': 'desc',
                'start': '0',
                'length': '100',
                'search[value]': None,
                'search[regex]': 'false'
            }
            r = requests.post(self.URL + page + '/data-table',
                              payload, headers=s.headers)
            h = r.status_code
            html = r.text
            soup = bs(html, 'html.parser')
            table = soup.find('tbody')
            rows = table.select('tbody > tr:nth-child(1) > td.nowrap')
            print(h)
            print(len(rows))


if __name__ == "__main__":
    scraper = okchangerScraper('https://www.okchanger.com/')
    scraper.scrape_this_page('exchangers')

You are receiving JSON here, not HTML. Try this:
import json
# ...
content = json.loads(r.text)
print(content)
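To get the names out, decode the response as JSON instead of feeding it to BeautifulSoup. A minimal sketch, reusing the headers and payload dictionaries already built in the question; the exact shape of the reply is an assumption (DataTables-style endpoints usually return a top-level "data" list), so print it first:
import requests

# `payload` and `headers` are the dictionaries defined in the question above.
r = requests.post('https://www.okchanger.com/exchangers/data-table',
                  data=payload, headers=headers)
content = r.json()
print(content.keys())  # confirm the layout before going further

# Assumption: DataTables-style replies carry the rows under "data",
# and 'Name' matches the column requested in the payload.
for row in content.get('data', []):
    print(row.get('Name'))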


How can I access dict from JSON dict after making an API call? [duplicate]

I want to access the values in the dict, but I cannot do response["establishments"] because there is a strange b symbol in the dict when it is returned. How can I get rid of the b symbol so I can access the dict?
b'{"establishments":[{"FHRSID":775937,"ChangesByServerID":0,"LocalAuthorityBusinessID":"13/00068/COMM","BusinessName":"Lexington Catering - Fidessa Restaurant","BusinessType":"Restaurant/Cafe/Canteen","BusinessTypeID":1,"AddressLine1":"Block E First To Seventh Floors Dukes Court Duke Street Woking Surrey","AddressLine2":"","AddressLine3":"","AddressLine4":"","PostCode":"GU21 5BH","Phone":"","RatingValue":"5","RatingKey":"fhrs_5_cy-gb","RatingDate":"2020-01-10T00:00:00","LocalAuthorityCode":"315","LocalAuthorityName":"Woking","LocalAuthorityWebSite":"http://www.woking.gov.uk","LocalAuthorityEmailAddress":"emma.bourne#woking.gov.uk","scores":{"Hygiene":0,"Structural":0,"ConfidenceInManagement":0},"SchemeType":"FHRS","geocode":{"longitude":"-0.554158","latitude":"51.320771"},"RightToReply":"","Distance":null,"NewRatingPending":false,"meta":{"dataSource":null,"extractDate":"0001-01-01T00:00:00","itemCount":0,"returncode":null,"totalCount":0,"totalPages":0,"pageSize":0,"pageNumber":0},"links":[{"rel":"self","href":"https://api.ratings.food.gov.uk/establishments/775937"}]},{"FHRSID":1474143,"ChangesByServerID":0,"LocalAuthorityBusinessID":"22/00013/COMM","BusinessName":"Duke\'s Bar & Deli","BusinessType":"Other catering premises","BusinessTypeID":7841,"AddressLine1":"Dukes Bar And Deli 3 Duke Street Woking Surrey","AddressLine2":"","AddressLine3":"","AddressLine4":"","PostCode":"GU21 5BH","Phone":"","RatingValue":"4","RatingKey":"fhrs_4_cy-gb","RatingDate":"2022-02-24T00:00:00","LocalAuthorityCode":"315","LocalAuthorityName":"Woking","LocalAuthorityWebSite":"http://www.woking.gov.uk","LocalAuthorityEmailAddress":"emma.bourne#woking.gov.uk","scores":{"Hygiene":10,"Structural":0,"ConfidenceInManagement":10},"SchemeType":"FHRS","geocode":{"longitude":null,"latitude":null},"RightToReply":"","Distance":null,"NewRatingPending":false,"meta":{"dataSource":null,"extractDate":"0001-01-01T00:00:00","itemCount":0,"returncode":null,"totalCount":0,"totalPages":0,"pageSize":0,"pageNumber":0},"links":[{"rel":"self","href":"https://api.ratings.food.gov.uk/establishments/1474143"}]}],"meta":{"dataSource":"Lucene","extractDate":"2022-09-03T00:50:19.6362148+01:00","itemCount":2,"returncode":"OK","totalCount":2,"totalPages":1,"pageSize":5000,"pageNumber":1},"links":[{"rel":"self","href":"https://api.ratings.food.gov.uk/establishments?address=gu21%205bh"}]}'
Here is my code:
import codecs
import json
import time

import requests

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36",
    "Upgrade-Insecure-Requests": "1",
    "DNT": "1",
    # "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "cy-GB",
    "Accept-Encoding": "gzip, deflate, br",
    "x-api-version": "2",
    # 'Content-Type': 'application/json'
}


def get_web_page_content(post_code):
    print('post_code', post_code)
    # Byte-order marks that might prefix the response body
    BOMS = [
        codecs.BOM,
        codecs.BOM_BE,
        codecs.BOM_LE,
        codecs.BOM_UTF8,
        codecs.BOM_UTF16,
        codecs.BOM_UTF16_BE,
        codecs.BOM_UTF16_LE,
        codecs.BOM_UTF32,
        codecs.BOM_UTF32_BE,
        codecs.BOM_UTF32_LE,
    ]
    url = rf'https://api.ratings.food.gov.uk/Establishments?address={post_code}'
    time.sleep(5)
    response = requests.get(url, headers=headers)
    data = response.content
    for BOM in BOMS:
        if data.startswith(BOM):
            data = json.loads(data[len(BOM):])
            break
    return data


if __name__ == '__main__':
    response = get_web_page_content('GU21 5BH')
    print('response', response)
You can access a response in different formats, e.g. bytes, JSON or raw. In your case you are reading it as bytes, which is why the result is a bytes object prefixed with b. If you want to access it with dict-like syntax, i.e. response["establishments"], then you need to use response.json().
Like this:
import requests

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36",
    "Upgrade-Insecure-Requests": "1",
    "DNT": "1",
    # "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "cy-GB",
    "Accept-Encoding": "gzip, deflate, br",
    "x-api-version": "2",
    # 'Content-Type': 'application/json'
}


def get_web_page_content(post_code):
    print('post_code', post_code)
    url = rf'https://api.ratings.food.gov.uk/Establishments?address={post_code}'
    response = requests.get(url, headers=headers)
    data = response.json()
    return data


if __name__ == '__main__':
    response = get_web_page_content('GU21 5BH')
    print('response', response)
    print('establishments', response["establishments"])
The "strange" b'...' symbol means you have got bytes object - that's what response.content returns. To decode json string use response.json():
import requests

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36",
    "Upgrade-Insecure-Requests": "1",
    "DNT": "1",
    "Accept-Language": "cy-GB",
    "Accept-Encoding": "gzip, deflate, br",
    "x-api-version": "2",
}


def get_web_page_content(post_code):
    print("post_code", post_code)
    url = rf"https://api.ratings.food.gov.uk/Establishments?address={post_code}"
    response = requests.get(url, headers=headers)
    return response.json()  # <-- use .json() here to parse the JSON response


if __name__ == "__main__":
    response = get_web_page_content("GU21 5BH")
    print("response", response)
Prints:
post_code GU21 5BH
response {'establishments': [{'FHRSID': 775937, 'ChangesByServerID': 0, 'LocalAuthorityBusinessID': '13/00068/COMM', 'BusinessName': 'Lexington Catering - Fidessa Restaurant', 'BusinessType': 'Restaurant/Cafe/Canteen', 'BusinessTypeID': 1, 'AddressLine1': 'Block E First To Seventh Floors Dukes Court Duke Street Woking Surrey', 'AddressLine2': '', 'AddressLine3': '', 'AddressLine4': '', 'PostCode': 'GU21 5BH', 'Phone': '', 'RatingValue': '5', 'RatingKey': 'fhrs_5_cy-gb', 'RatingDate': '2020-01-10T00:00:00', 'LocalAuthorityCode': '315', 'LocalAuthorityName': 'Woking', 'LocalAuthorityWebSite': 'http://www.woking.gov.uk', 'LocalAuthorityEmailAddress': 'emma.bourne#woking.gov.uk', 'scores': {'Hygiene': 0, 'Structural': 0, 'ConfidenceInManagement': 0}, 'SchemeType': 'FHRS', 'geocode': {'longitude': '-0.554158', 'latitude': '51.320771'}, 'RightToReply': '', 'Distance': None, 'NewRatingPending': False, 'meta': {'dataSource': None, 'extractDate': '0001-01-01T00:00:00', 'itemCount': 0, 'returncode': None, 'totalCount': 0, 'totalPages': 0, 'pageSize': 0, 'pageNumber': 0}, 'links': [{'rel': 'self', 'href': 'https://api.ratings.food.gov.uk/establishments/775937'}]}, {'FHRSID': 1474143, 'ChangesByServerID': 0, 'LocalAuthorityBusinessID': '22/00013/COMM', 'BusinessName': "Duke's Bar & Deli", 'BusinessType': 'Other catering premises', 'BusinessTypeID': 7841, 'AddressLine1': 'Dukes Bar And Deli 3 Duke Street Woking Surrey', 'AddressLine2': '', 'AddressLine3': '', 'AddressLine4': '', 'PostCode': 'GU21 5BH', 'Phone': '', 'RatingValue': '4', 'RatingKey': 'fhrs_4_cy-gb', 'RatingDate': '2022-02-24T00:00:00', 'LocalAuthorityCode': '315', 'LocalAuthorityName': 'Woking', 'LocalAuthorityWebSite': 'http://www.woking.gov.uk', 'LocalAuthorityEmailAddress': 'emma.bourne#woking.gov.uk', 'scores': {'Hygiene': 10, 'Structural': 0, 'ConfidenceInManagement': 10}, 'SchemeType': 'FHRS', 'geocode': {'longitude': None, 'latitude': None}, 'RightToReply': '', 'Distance': None, 'NewRatingPending': False, 'meta': {'dataSource': None, 'extractDate': '0001-01-01T00:00:00', 'itemCount': 0, 'returncode': None, 'totalCount': 0, 'totalPages': 0, 'pageSize': 0, 'pageNumber': 0}, 'links': [{'rel': 'self', 'href': 'https://api.ratings.food.gov.uk/establishments/1474143'}]}], 'meta': {'dataSource': 'Lucene', 'extractDate': '2022-09-03T00:59:03.9485729+01:00', 'itemCount': 2, 'returncode': 'OK', 'totalCount': 2, 'totalPages': 1, 'pageSize': 5000, 'pageNumber': 1}, 'links': [{'rel': 'self', 'href': 'https://api.ratings.food.gov.uk/establishments?address=gu21%205bh'}]}
You can get the JSON using response.json():
print("response", response.json())
The strange b prefix indicates that the value is a bytes object.
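Once the response has been decoded with response.json(), "establishments" is an ordinary list of dicts and can be iterated directly. A short sketch using the get_web_page_content function from the answers above (BusinessName and RatingValue are keys visible in the sample response):
data = get_web_page_content("GU21 5BH")

# Each establishment is a plain dict once the JSON has been decoded.
for establishment in data["establishments"]:
    print(establishment["BusinessName"], "- rating", establishment["RatingValue"])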

Scraping a table on the Barchart website using Python only returns a single day of data

I used the script in the above link to get a table on the Barchart website, and it somehow only scraped a single day of data instead of the data that appears on the whole page. I guess some of the fields I filled in were wrong, and I couldn't figure out how to fix it.
import requests
from urllib.parse import unquote

geturl = r'https://www.barchart.com/stocks/quotes/AAPL%7C20210423%7C126.00C/price-history/'
apiurl = r'https://www.barchart.com/proxies/core-api/v1/quotes/get'

getheaders = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'en-US,en;q=0.9',
    'cache-control': 'max-age=0',
    'upgrade-insecure-requests': '1',
    "referer": "https://www.barchart.com/stocks/quotes/AAPL%7C20210423%7C126.00C/price-history/historical",
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'
}

s = requests.Session()
r = s.get(geturl, headers=getheaders)

headers = {
    'accept': 'application/json',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'en-US,en;q=0.9',
    'user-agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36",
    'x-xsrf-token': unquote(unquote(s.cookies.get_dict()['XSRF-TOKEN']))
}

payload = {
    "symbol": "AAPL|20210423|126.00C",
    "fields": "tradeTime.format(m\/d\/Y),openPrice,highPrice,lowPrice,lastPrice,priceChange,percentChange,volume,openInterest,impliedVolatility,symbolCode,symbolType",
    "type": "eod",
    "orderBy": "tradeTime",
    "orderDir": "desc",
    "limit": 65,
    # "meta": "field.shortName,field.type,field.description",
    'raw': '1'
}

r = s.get(apiurl, params=payload, headers=headers)
j = r.json()
print(j)
Output:
{'count': 1, 'total': 1, 'data': [{'tradeTime': '04/21/2021', 'openPrice': '6.65', 'highPrice': '7.65', 'lowPrice': '5.56', 'lastPrice': '7.58', 'priceChange': '+0.45', 'percentChange': '+6.31%', 'volume': '1,213', 'openInterest': '3,951', 'impliedVolatility': 'N/A', 'symbolCode': 'STKOPT', 'symbolType': 34, 'raw': {'tradeTime': 1619036084, 'openPrice': 6.65, 'highPrice': 7.65, 'lowPrice': 5.56, 'lastPrice': 7.58, 'priceChange': '0.45', 'percentChange': 0.0631, 'volume': 1213, 'openInterest': 3951, 'impliedVolatility': None, 'symbolCode': 'STKOPT', 'symbolType': 34}}]}
I expected it to scrape three months of data. Can anyone figure out the problem? Thanks.
This worked for me:
Replace the API 'https://www.barchart.com/proxies/core-api/v1/quotes/get'
with
'https://www.barchart.com/proxies/core-api/v1/historical/get'
Output
{'count': 32, 'total': 4, 'data': [{'tradeTime': '04/23/2021', 'openPrice': '6.16', 'highPrice': '9.09', '...
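In other words, keep the session, headers and payload from the question unchanged and only swap the endpoint; a sketch (the tradeTime and lastPrice field names come from the output shown above):
# Same session `s`, `headers` and `payload` as in the question; only the URL changes.
apiurl = r'https://www.barchart.com/proxies/core-api/v1/historical/get'

r = s.get(apiurl, params=payload, headers=headers)
j = r.json()
print(j['count'], 'rows returned')
for row in j['data']:
    print(row['tradeTime'], row['lastPrice'])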

Python Scrape NBA Tracking Drives Data

I am fairly new to Python. I am trying to scrape NBA Drives data via https://stats.nba.com/players/drives/
I used Chrome Devtools to find the API URL. I then used the requests package to get the JSON string.
Original code:
import requests
headers = {"User-Agent": "Mozilla/5.0..."}
url = " https://stats.nba.com/stats/leaguedashptstats?College=&Conference=&Country=&DateFrom=&DateTo=&Division=&DraftPick=&DraftYear=&GameScope=&Height=&LastNGames=0&LeagueID=00&Location=&Month=0&OpponentTeamID=0&Outcome=&PORound=0&PerMode=PerGame&PlayerExperience=&PlayerOrTeam=Player&PlayerPosition=&PtMeasureType=Drives&Season=2019-20&SeasonSegment=&SeasonType=Regular+Season&StarterBench=&TeamID=0&VsConference=&VsDivision=&Weight="
r = requests.get(url, headers = headers)
d = r.json()
This no longer works, however. For some reason the request for the URL link below times out on the NBA server. So I need to find a new way to get this information.
< https://stats.nba.com/stats/leaguedashptstats?College=&Conference=&Country=&DateFrom=&DateTo=&Division=&DraftPick=&DraftYear=&GameScope=&Height=&LastNGames=0&LeagueID=00&Location=&Month=0&OpponentTeamID=0&Outcome=&PORound=0&PerMode=PerGame&PlayerExperience=&PlayerOrTeam=Player&PlayerPosition=&PtMeasureType=Drives&Season=2019-20&SeasonSegment=&SeasonType=Regular+Season&StarterBench=&TeamID=0&VsConference=&VsDivision=&Weight=>
I was exploring Chrome DevTools and found that the desired JSON string is shown in the Network XHR Response tab. Is there any way to scrape that into Python? See the image below.
Chrome DevTools: XHR Response JSON string
I tested the URL with other headers (which I saw in DevTools for this request) and it seems it needs the Referer header to work correctly.
EDIT 2020.08.15:
I had to add new headers to read it:
'x-nba-stats-origin': 'stats',
'x-nba-stats-token': 'true',
import requests

headers = {
    'User-Agent': 'Mozilla/5.0',
    #'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:72.0) Gecko/20100101 Firefox/72.0',
    'Referer': 'https://stats.nba.com/players/drives/',
    #'Accept': 'application/json, text/plain, */*',
    'x-nba-stats-origin': 'stats',
    'x-nba-stats-token': 'true',
}

url = 'https://stats.nba.com/stats/leaguedashptstats?College=&Conference=&Country=&DateFrom=&DateTo=&Division=&DraftPick=&DraftYear=&GameScope=&Height=&LastNGames=0&LeagueID=00&Location=&Month=0&OpponentTeamID=0&Outcome=&PORound=0&PerMode=PerGame&PlayerExperience=&PlayerOrTeam=Player&PlayerPosition=&PtMeasureType=Drives&Season=2019-20&SeasonSegment=&SeasonType=Regular+Season&StarterBench=&TeamID=0&VsConference=&VsDivision=&Weight='

r = requests.get(url, headers=headers)
data = r.json()
print(data)
BTW: the same code but with the params as a dictionary, so it is easier to set different values.
import requests

headers = {
    'User-Agent': 'Mozilla/5.0',
    #'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:72.0) Gecko/20100101 Firefox/72.0',
    'Referer': 'https://stats.nba.com/players/drives/',
    #'Accept': 'application/json, text/plain, */*',
    'x-nba-stats-origin': 'stats',
    'x-nba-stats-token': 'true',
}

url = 'https://stats.nba.com/stats/leaguedashptstats'

params = {
    'College': '',
    'Conference': '',
    'Country': '',
    'DateFrom': '',
    'DateTo': '',
    'Division': '',
    'DraftPick': '',
    'DraftYear': '',
    'GameScope': '',
    'Height': '',
    'LastNGames': '0',
    'LeagueID': '00',
    'Location': '',
    'Month': '0',
    'OpponentTeamID': '0',
    'Outcome': '',
    'PORound': '0',
    'PerMode': 'PerGame',
    'PlayerExperience': '',
    'PlayerOrTeam': 'Player',
    'PlayerPosition': '',
    'PtMeasureType': 'Drives',
    'Season': '2019-20',
    'SeasonSegment': '',
    'SeasonType': 'Regular Season',
    'StarterBench': '',
    'TeamID': '0',
    'VsConference': '',
    'VsDivision': '',
    'Weight': '',
}

r = requests.get(url, headers=headers, params=params)
#print(r.request.url)
data = r.json()
print(data)
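If the goal is a table of players, note that this endpoint does not return flat records. A sketch of reshaping the reply with pandas, assuming the usual stats.nba.com layout in which resultSets[0] carries separate headers and rowSet lists (print data first to confirm):
import pandas as pd

# `data` is the decoded JSON from the request above.
# Assumption: the response looks like
#   {"resultSets": [{"name": ..., "headers": [...], "rowSet": [[...], ...]}]}
result = data['resultSets'][0]
df = pd.DataFrame(result['rowSet'], columns=result['headers'])
print(df.head())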

Extracting Text from Javascript or Ajax based webpages?

Is there a way to scrape the text from JavaScript-based sites, for example: https://www.ajio.com/ajio-mid-rise-slim-fit-cargo-pants/p/460151939_brown
I need the product specifications from this page. How can I do this?
Product details can easily be extracted using Selenium WebDriver:
from selenium import webdriver

driver = webdriver.Chrome()
driver.get('https://www.ajio.com/ajio-mid-rise-slim-fit-cargo-pants/p/460151939_brown')

# '@class' (not '#class') selects the class attribute in XPath
list_product = driver.find_elements_by_xpath('//ul[@class="prod-list"]/li')
description_1 = list_product[0].text
Similarly, you can extract all the other values, for example by looping over every list item, as in the sketch below.
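A minimal sketch of that loop (it only reuses the list_product locator from the snippet above; adjust if the page structure differs):
# Collect the text of every specification row, not just the first one.
specifications = [item.text for item in list_product]
print(specifications)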
Without Selenium, just a regexp:
import re
import json
import requests
from pprint import pprint

headers = {
    'Connection': 'keep-alive',
    'Cache-Control': 'max-age=0',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'DNT': '1',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'es-ES,es;q=0.9,en;q=0.8',
}

response = requests.get('https://www.ajio.com/ajio-mid-rise-slim-fit-cargo-pants/p/460151939_brown', headers=headers)
html = response.text

# The product data is embedded in the page as window.__PRELOADED_STATE__
regex = r"<script>\s+window.__PRELOADED_STATE__ =(.*);\s+</script>\s+<script\s+id\s+=\s+\"appJs\""
data = re.findall(regex, html, re.MULTILINE | re.DOTALL)[0]
state = json.loads(data)

details = []
for row in state['product']['productDetails']['featureData']:
    try:
        value = row['featureValues'][0]['value']
    except KeyError:
        value = None
    finally:
        details.append({'name': row['name'], 'value': value})

pprint(details)
Result:
[{'name': 'Highlight', 'value': 'Multiple pockets'},
 {'name': 'Hidden Detail', 'value': 'Belt loops'},
 {'name': 'Additional Informations', 'value': 'Zip fly closure'},
 {'name': 'Waist Rise', 'value': 'Mid Rise'},
 {'name': 'Fabric Composition', 'value': '100% Cotton'},
 {'name': 'Size worn by Model', 'value': '32'},
 {'name': 'Fit Type', 'value': 'Straight Fit'},
 {'name': 'Size Detail', 'value': 'Fits true to standard size on the model'},
 {'name': 'Wash Care', 'value': 'Machine wash'},
 {'name': 'Model Waist Size', 'value': '32"'},
 {'name': 'Model Height', 'value': "6'"},
 {'name': 'Size Format', 'value': None}]

Posting Payment Data

I've been working on this script. It's a script for auto-checkout on a Shopify-based site like this one (https://www.cityblueshop.com/products/kixx_asst). My problem is that everything works fine except submitting the payment data. For some reason it won't post the payment, even though I'm correctly extracting the id for cc_verify_id. If you can test it out and let me know what I'm doing wrong (I've been stuck on this step for several days), it would be really appreciated. You can input fake contact and credit card information. P.S. I'm new to programming, so it might look messy. Thanks in advance for your help.
[EDIT] It looks like it's not submitting the data properly from paymentdata, but I still can't pinpoint where the problem is.
import requests, sys, time, re
from datetime import datetime
from bs4 import BeautifulSoup
from urllib.parse import urlparse

s = requests.session()


def UTCtoEST():
    current = datetime.now()
    return str(current) + ' EST'


home = 'cityblueshop'

###Get Session Id###
session = s.get('https://www.'+home+'.com/cart.js').json()
sessionID = session['token']
print('SessionID:', sessionID)

###ATC###
print(UTCtoEST(), 'Adding item....')
atcdata = {
    'id': '37431305678',
    'quantity': '1'
}
for atcurlRetry in range(1):
    atcURL = s.post('https://www.'+home+'.com/cart/add.js', data=atcdata, allow_redirects=True)
    match = re.findall('"quantity":1', atcURL.text)
    if match:
        print(UTCtoEST(), 'ATC successful....')
        break
    print(UTCtoEST(), 'Trying to ATC....')
    time.sleep(0)
else:
    print(UTCtoEST(), 'Could not ATC after ' + ' retries, therefore exiting the bot.')
    sys.exit(1)

###Going to Checkout Page###
for cartRetry in range(1):
    cartdata = {
        'updates[]': 1,
        'note': '',
        'checkout': 'Check Out'
    }
    atc = s.post('https://www.'+home+'.com/cart', data=cartdata, allow_redirects=True)

###Parsing URL###
parse = urlparse(atc.url)
storeID = parse.path.split('/')[1]
checkoutID = parse.path.split('checkouts/')[1]
print('Checkout Session Id:', checkoutID)

###Get Token###
soup = BeautifulSoup(atc.text, 'lxml')
input = soup.find_all('input')[2]
auth_token = input.get('value')
print('Auth_token:', auth_token)
###Get Contact info###
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.8',
    'Host': 'checkout.shopify.com',
    'Referer': 'https://checkout.shopify.com/'+storeID+'/checkouts/'+checkoutID+'?step=contact_information',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36'
}
qs = {
    'utf8': '✓',
    '_method': 'patch',
    'authenticity_token': auth_token,
    'previous_step': 'contact_information',
    'checkout[email]': 'email',
    'checkout[shipping_address][first_name]': 'First',
    'checkout[shipping_address][last_name]': 'Last',
    'checkout[shipping_address][company]': '',
    'checkout[shipping_address][address1]': 'Address 1',
    'checkout[shipping_address][address2]': '',
    'checkout[shipping_address][city]': 'City',
    'checkout[shipping_address][country]': 'United States',
    'checkout[shipping_address][province]': 'New York',
    'checkout[shipping_address][zip]': 'Zip',
    'checkout[shipping_address][phone]': 'Phone',
    'checkout[remember_me]': '0',
    'checkout[client_details][browser_width]': '979',
    'checkout[client_details][browser_height]': '631',
    'checkout[client_details][javascript_enabled]': '1',
    'step': 'contact_information'
}
GETcontact = s.get(atc.url, data=qs, headers=headers, allow_redirects=True)
###Post Contact Info###
headers1 = {
    'Origin': 'https://checkout.shopify.com',
    'Content-Type': 'application/x-www-form-urlencoded',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.8',
    'Referer': 'https://checkout.shopify.com/'+storeID+'/checkouts/'+checkoutID,
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36'
}
formData = {
    'utf8': '✓',
    '_method': 'patch',
    'authenticity_token': auth_token,
    'button': '',
    'checkout[email]': 'Email',
    'checkout[shipping_address][first_name]': 'First',
    'checkout[shipping_address][last_name]': 'Last',
    'checkout[shipping_address][company]': '',
    'checkout[shipping_address][address1]': 'Address 1',
    'checkout[shipping_address][address2]': '',
    'checkout[shipping_address][city]': 'City',
    'checkout[shipping_address][country]': 'United States',
    'checkout[shipping_address][province]': 'New York',
    'checkout[shipping_address][zip]': 'Zip',
    'checkout[shipping_address][phone]': 'Phone',
    'checkout[remember_me]': '0',
    'checkout[client_details][browser_width]': '979',
    'checkout[client_details][browser_height]': '631',
    'checkout[client_details][javascript_enabled]': '1',
    'previous_step': 'contact_information',
    'step': 'shipping_method'
}
POSTcontact = s.post(atc.url, data=formData, headers=headers1, allow_redirects=True)

###Parsing Shipping Method###
soup = BeautifulSoup(POSTcontact.text, 'html.parser')
shipping = soup.find(attrs={'class': 'radio-wrapper'})
shipping_method = shipping.get('data-shipping-method')
###Submitting Shipping Data###
headers2 = {
    'Origin': 'https://checkout.shopify.com',
    'Content-Type': 'application/x-www-form-urlencoded',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.8',
    'Referer': 'https://checkout.shopify.com/'+storeID+'/checkouts/'+checkoutID,
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36'
}
ShipformData = {
    'utf8': '✓',
    '_method': 'patch',
    'authenticity_token': auth_token,
    'previous_step': 'shipping_method',
    'step': 'payment_method',
    'checkout[shipping_rate][id]': shipping_method,
    'button': '',
    'checkout[client_details][browser_width]': '1280',
    'checkout[client_details][browser_height]': '368',
    'checkout[client_details][javascript_enabled]': '1'
}
shippingmethod = s.post(atc.url, data=ShipformData, headers=headers2, allow_redirects=True)

###Parsing payment_gateaway###
soup = BeautifulSoup(shippingmethod.text, 'html.parser')
ul = soup.find(attrs={'class': 'radio-wrapper content-box__row '})
payment_gateaway = ul.get('data-select-gateway')
###submitting payment info###
CCheaders = {
    'accept': 'application/json',
    'Origin': 'https://checkout.shopifycs.com',
    'Accept-Language': 'en-US,en;q=0.8',
    'Host': 'elb.deposit.shopifycs.com',
    'content-type': 'application/json',
    'Referer': 'https://checkout.shopifycs.com/number?identifier='+checkoutID+'&location=3A%2F%2Fcheckout.shopify.com%2F'+storeID+'%2Fcheckouts%2F'+checkoutID+'%3Fpreviousstep%3Dshipping_method%26step%3Dpayment_method',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36'
}
ccinfo = {
    'number': "0000 0000 0000 0000",
    'name': "First Last",
    'month': 5,
    'year': 2020,
    'verification_value': "000"
}
creditcard = s.post('https://elb.deposit.shopifycs.com/sessions', json=ccinfo, headers=CCheaders, allow_redirects=True)
cc_verify = creditcard.json()
cc_verify_id = cc_verify['id']

###submitting credit card info###
paymentheaders = {
    'Origin': 'https://checkout.shopify.com',
    'Content-Type': 'application/x-www-form-urlencoded',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.8',
    'Referer': 'https://checkout.shopify.com/'+storeID+'/checkouts/'+checkoutID,
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36'
}
paymentdata = {
    '_method': 'patch',
    'authenticity_token': auth_token,
    'checkout[buyer_accepts_marketing]': '1',
    'checkout[client_details][browser_height]': '979',
    'checkout[client_details][browser_width]': '631',
    'checkout[client_details][javascript_enabled]': '1',
    'checkout[credit_card][vault]': 'false',
    'checkout[different_billing_address]': 'false',
    'checkout[payment_gateway]': payment_gateaway,
    'checkout[total_price]': '1199',
    'complete': '1',
    'previous_step': 'payment_method',
    's': cc_verify_id,
    'step': '',
    'utf8': '✓'
}
submitpayment = s.post(atc.url, data=paymentdata, headers=paymentheaders, allow_redirects=True)
print(UTCtoEST(), submitpayment.status_code, submitpayment.url)
Just a guess, but this isn't a proper key if you are trying to post JSON:
'checkout[total_price]': '1199',
You would need to rewrite it as
'checkout': {
    'total_price': '1199',
}
And you need to apply this solution to all the other values in that format.
For example
'checkout': {
    'remember_me': '',
    'shipping_address': {
        'first_name': 'First',
        'last_name': 'Last',
    },
},
And I think you can use the Python False value instead of the string 'false', but that depends on the API.
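If the endpoint really does expect JSON, the nested structure would be sent with requests' json= parameter rather than data=. A rough sketch of what this answer describes (field names copied from the question; the endpoint URL is a placeholder, and whether this checkout actually accepts JSON is an assumption to verify):
import requests

# Hypothetical reshaping of the flat form fields into nested JSON, as suggested above.
payment_json = {
    'complete': '1',
    's': 'cc_verify_id_goes_here',           # the id returned by shopifycs.com
    'checkout': {
        'total_price': '1199',
        'payment_gateway': 'gateway_id_goes_here',
        'different_billing_address': False,   # Python False instead of the string 'false'
        'client_details': {
            'browser_width': '631',
            'browser_height': '979',
            'javascript_enabled': '1',
        },
    },
}

# json= serializes the dict and sets the Content-Type: application/json header.
response = requests.post('https://checkout.shopify.com/example-endpoint',
                         json=payment_json)
print(response.status_code)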
