Scrape webpage with dynamic javascript using Selenium - python

I am trying to scrape a web page with dynamic javascript loading using selenium, but the table that I want to scrape is never actually loading in the source and is instead just showing javascript. The code I am trying to run is the following:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium import webdriver
from bs4 import BeautifulSoup
url = ""
browser = webdriver.Firefox(executable_path=r'/Users/brendanbernstein/Downloads/geckodriver')
browser.get(url)
WebDriverWait(browser, 10).until(
EC.visibility_of_element_located((By.ID, "maintable")))
html_page = browser.page_source
browser.quit()
soup = BeautifulSoup(html_page)
Unfortunately, even using selenium, the javascript is not executing. The script that executes and generates the table I believe looks like the below and is all I'm seeing in the scraped source code:
<script type="text/javascript">
var hash = window.location.hash.substring(1);
$(document).ready(function () {
if (hash != '') {
activaTab(hash);
} else {
loadIframeSource3();
};
//$('i').tooltip({ placement: 'bottom', trigger: 'manual' }).tooltip('show');
});
function activaTab(tab) {
if (tab.indexOf('comment') >= 0) {
tab = 'comments';
loaddisqus();
} else if (tab.indexOf('readContract') >= 0) {
loadIframeSource();
} else if (tab.indexOf('balances') >= 0) {
loadIframeSource2();
};
$('.nav-tabs a[href="#' + tab + '"]').tab('show');
};
function updatehash(strhash) {
if (strhash == '') {
history.replaceState("", document.title, window.location.pathname);
} else {
var baseUrl = window.location.href.split('#')[0];
history.replaceState("", document.title, baseUrl + '#' + strhash);
}
}
var disqusloaded = false;
function loaddisqus() {
if (disqusloaded == false) {
var dsq = document.createElement('script'); dsq.type = 'text/javascript'; dsq.async = true;
dsq.src = '//' + disqus_shortname + '.disqus.com/embed.js';
(document.getElementsByTagName('head')[0] || document.getElementsByTagName('body')[0]).appendChild(dsq);
disqusloaded = true;
}
updatehash('comments');
}
var readContractLoaded = false;
function loadIframeSource() {
if (readContractLoaded == false) {
readContractLoaded = true;
document.getElementById('readcontractiframe').src = '/readContract?a=0xe94327d07fc17907b4db788e5adf2ed424addff6&v=0xe94327d07fc17907b4db788e5adf2ed424addff6';
}
}
var token_holders_loaded = false;
function loadIframeSource2() {
if (token_holders_loaded == false) {
token_holders_loaded = true;
document.getElementById('tokeholdersiframe').src = '/token/generic-tokenholders2?a=0xe94327d07fc17907b4db788e5adf2ed424addff6&s=11000000000000000000000000';
}
}
var token_transactions_loaded = false;
function loadIframeSource3() {
if (token_transactions_loaded == false) {
token_transactions_loaded = true;
document.getElementById('tokentxnsiframe').src = '/token/generic-tokentxns2?contractAddress=0xe94327d07fc17907b4db788e5adf2ed424addff6&a=&mode=';
}
}
</script>
Any suggestions?

Related

Get the value of a variable stocked in a js code

I am trying to get the dates when available on this datepicker
When availables , the dates are saved in the variable available_dates inside a js script which fills the html input.
HTML of the input :
<input type="text" readonly="" class="form-control-input app_date validate" style="width: 260px;" id="app_date" name="app_date" placeholder="YYYY-MM-DD" onchange="this.form.submit();showLoader();" value="" autocomplete="off">
How can i retrieve the values from available_dates when the dates are availables, get them to my python script to select one of them.
The following python script didn't work :
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
driver.find_element(By.XPATH, "//*[#id='app_date']").click()
time.sleep(3)
try:
print([date.get_attribute("innerHTML") for date in WebDriverWait(driver, 20).until(EC.visibility_of_all_elements_located((By.XPATH, "//div[#class='datepicker-days']/table[#class=' table-condensed']//tr//td[#class='day activeClass' and #title='Book']")))])
except TimeoutException:
print("No booking dates available for the current month")
driver.refresh()
JS script :
<script type="text/javascript" xpath="1">
var today = new Date();
var dd = today.getDate()+1;
var mm = today.getMonth()+1; //January is 0!
var yyyy = today.getFullYear();
if(dd<10){
dd='0'+dd
}
if(mm<10){
mm='0'+mm
}
var today = yyyy+'-'+mm+'-'+dd;
function formatDate(rawDate) {
var day = ("0" + rawDate.getDate()).slice(-2);
var month = ("0" + (rawDate.getMonth() + 1)).slice(-2);
return (day)+ "-" + (month)+ "-" +rawDate.getFullYear() ;
}
$(document).ready(function() {
var dt1 = '2023-02-14';
var checkService = 'Normal';
$('.date_of_birth').datepicker({
format: "yyyy-mm-dd",
endDate: new Date(dt1),
startDate: '-100y',
autoclose: true,
startView: 2
});
$('.pptIssueDate').datepicker({
format: "yyyy-mm-dd",
endDate: new Date(dt1),
startDate: '-100y',
autoclose: true,
startView: 2
});
$('.passport_validate_till').datepicker({
format: "yyyy-mm-dd",
startDate: new Date(dt1),
autoclose: true,
startView: 2
});
var dt4 = '2023-02-15';
var blocked_dates = ["01-01-2023","02-01-2023","06-01-2023","13-01-2023","20-01-2023","27-01-2023","03-02-2023","10-02-2023","17-02-2023","24-02-2023","01-01-2023","02-01-2023","06-01-2023","13-01-2023","20-01-2023","27-01-2023","03-02-2023","10-02-2023","17-02-2023","24-02-2023"];
var available_dates = [];
var fullCapicity_dates = ["15-02-2023","16-02-2023","20-02-2023","21-02-2023","22-02-2023","23-02-2023","27-02-2023","28-02-2023"];
var offDates_dates = ["17-02-2023","18-02-2023","19-02-2023","24-02-2023","25-02-2023","26-02-2023"];
var allowArray = [1,2];
if(checkService == 'Normal')
{
/*if((jQuery.inArray(2, allowArray)!='-1') || (jQuery.inArray(3, allowArray)!='-1') || (jQuery.inArray(4, allowArray)!='-1'))
{
var classFull = 'fullcapspecial';
var tooltipTitle = ' ';
var backDatetitle = 'Not Allowed';
}else{
var classFull = 'fullcap';
var tooltipTitle = 'Slots Full';
var backDatetitle = 'Not Allowed';
}*/
var classFull = 'fullcap';
var tooltipTitle = 'Slots Full';
var backDatetitle = 'Not Allowed';
}else{
var classFull = 'fullcap';
var tooltipTitle = 'Slots Full';
var backDatetitle = 'Not Allowed';
}
$('.app_date').datepicker({
language: "en",
Default: true,
format: "yyyy-mm-dd",
startDate: new Date(dt4),
endDate: '2023-02-28',
autoclose: true,
forceParse:true,
startView: 0,
beforeShowDay: function(date){
var formattedDate = formatDate(date);
if ($.inArray(formattedDate.toString(), blocked_dates) != -1){
return {
enabled : false,
classes: 'inactiveClass',
tooltip: 'Holiday'
};
}
if ($.inArray(formattedDate.toString(), available_dates) != -1){
return {
enabled : true,
classes: 'activeClass',
tooltip: 'Book'
};
}
if ($.inArray(formattedDate.toString(), fullCapicity_dates) != -1){
return {
enabled : false,
classes: classFull,
tooltip: tooltipTitle
};
}
if ($.inArray(formattedDate.toString(), offDates_dates) != -1){
return {
enabled : false,
classes: 'offday',
tooltip: 'Off Day'
};
}
return {
enabled : false,
tooltip: backDatetitle
};
return;
}
});
/*====== CALL POP FOR PL/PT IN NORMAL CASE=======*/
if(checkService == 'Normal')
{
if((jQuery.inArray(2, allowArray)!='-1') || (jQuery.inArray(3, allowArray)!='-1') || (jQuery.inArray(4, allowArray)!='-1'))
{
/*$(document).on('click', '.fullcap,.fullcapspecial', function () {
$(".datepicker").hide();
$('.popupBG').show();
$('#IDBodyPanel').show();
});
$(".popupCloseIcon").click(function() {
$(".popupBG").hide();
$("#IDBodyPanel").hide();
});*/
/*$('input[type=radio][name=serviceChange]').change(function() {
if (this.value == 'Premium') {
$("#premiumService").prop('value', 'GO FOR PREMIUM');
}
else if (this.value == 'Prime') {
$("#premiumService").prop('value', 'GO FOR PRIME TIME');
}
});*/
}
}
/*====== CALL POP FOR PL/PT IN NORMAL CASE=======*/
var eventhandler = function(e) {
e.preventDefault();
}
if (checkService == 'Premium' || checkService == 'Prime' || checkService == 'Premium-Saturday') {
$('input[name="vasId[]"]:checked').each(function() {
$("#vasId"+this.value).bind('click', eventhandler);
});
}
if (checkService != 'Premium')
{
$(document).on('click', '.chkbox', function () {
if($(this).val() == 1)
{
if($(this).is(":checked")){
//$("#vasId6").prop('checked', true);
//$("#vasId6").bind('click', eventhandler);
}else{
//$("#vasId6").prop('checked', false);
//$("#vasId6").unbind('click', eventhandler);
}
}
});
}
});
</script>
Solved

Python requests login with jsencrypted post

ive been trying to login with the following code in a chinese monitoring system that uses js forms.
i assume i cannot log in because the payload is encrypted with a public key and posted to a randomly generated URL.
import sys, re, requests
class GMU:
def __init__(self):
payload = {'inputAccount': 'admin','inputPassword': 'admin'}
s = requests.Session()
p = s.post('http://10.40.100.146/',data=payload)
self.p = p
r = s.get('http://10.40.100.146/ActiveSignal/ActiveSignalPartial?equipmentId=300001003')
data = str(r.content)
self.data = data
def ask():
return GMU()
n = ask()
print(n.p)
print(n.data)
Here is a post header
Payload
Initiator
looking in the initatior section "send" i found loginname.js with the following code:
var publicKey = "-----BEGIN PUBLIC KEY-----MIGfMA0GCSqGSIb3DQEBAQUAA4GNADCBiQKBgQC6MqYui/VMzy0qQE2c6S24fNbph7Hr/Nh29aJJ0kWjINV3zPgJXZrmJp43PaQBxHkq3cESuSf9zUCBS0wZtvFL8LTU1Iehnh/rPVKfGHQaHoD928n7CXuGwnEsyl63p6wwgIjiENVTMaReCJz79N2fhXFK99cA1/B7JXRtCPr/pwIDAQAB-----END PUBLIC KEY-----";
var clientMenuTemplateType = 4;
function logout() {
BootstrapDialog.show({
title: lang["Exit the confirmation"],
message: lang["Quit or not?"],
cssClass: 'user-dialog',
buttons: [{
label: lang["Confirm"],
cssClass: 'btn blue btn-outline dropdown-toggle',
action: function () {
//执行操作
location.href = '.././Account/Logout';
}
}, {
label: lang["Cancel"],
action: function (dialog) {
dialog.close();
}
}]
});
}
$("#loginBtn").click(function () {
Login();
});
$('#inputPassword').bind('keyup', function (event) {
if (event.keyCode == "13") {
//回车执行
Login();
}
});
$('#inputAccount').bind('keyup', function (event) {
if (event.keyCode == "13") {
//回车执行
Login();
}
});
$('#loginBtn').bind('keyup', function (event) {
if (event.keyCode == "13") {
//回车执行
Login();
}
});
var flag = 0;
$("#pswdVisual").click(function () {
var input = document.getElementById('inputPassword');
if (flag == 0) {
input.type = 'text';
$("#pswdVisual").addClass("glyphicon-eye-open");
$("#pswdVisual").removeClass("glyphicon-eye-close");
flag = 1;
} else {
input.type = 'password';
$("#pswdVisual").addClass("glyphicon-eye-close");
$("#pswdVisual").removeClass("glyphicon-eye-open");
flag = 0;
}
});
function Login() {
var options = {
positionClass: 'toast-bottom-right'
}
var account = $('#inputAccount').val();
var password = $('#inputPassword').val();
if (!account || account == null || account === "") {
toastr.error(lang["Username cannot be empty!"], lang["login failure"], options);
} else if (!password || password == null || password === "") {
toastr.error(lang["Password cannot be empty!"], lang["login failure"], options);
} else {
var data = account + "&" + password;
//RSA Encrypt
var jsEncrypt = new JSEncrypt();
jsEncrypt.setPublicKey(publicKey);
var key = jsEncrypt.encrypt(data);
$.ajax({
type: "post",
url: currentUrl + "Account/LoginAction/?t=" + Math.random(),
data: {
encryptContext: key
},
dataType: "json",
success: function (response) {
if (response != null) {
if (response.url !== "") {
cookieObject.setCookie("userName", account);
if (response.url.indexOf("Default/Default?ZT=") !== -1) {
location.href = currentUrl + "Default/Default#ZT=" + response.url.replace("Default/Default?ZT=", "");
} else {
location.href = currentUrl + "Default/Default#Page=" + response.url;
}
}
else if (response.error !== "")
console.log(response.error);
else if (response.show !== "") {
if (response.show == lang["The user name does not exist"]) {
toastr.error("", lang["login failure"], options);
} else {
toastr.error(response.show, lang["login failure"], options);
}
}
}
},
error: function () {
}
});
}
};
i dont know if should be passing the rsa encrypted payload or there is some way to just provide the user and password and let the js thing generate itself.

BeautifulSoup cannot scrape data from HKJC web

I am trying to scrape data from
https://racing.hkjc.com/racing/information/English/Racing/LocalResults.aspx?RaceDate=2020/01/27&Racecourse=ST&RaceNo=1
using BeautifulSoup in python with below simple code,
import requests
from bs4 import BeautifulSoup
url = "https://racing.hkjc.com/racing/information/English/Racing/LocalResults.aspx?RaceDate=2020/01/27&Racecourse=ST&RaceNo=2"
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')
print(soup.prettify())
It works occasionally but most of the times it returns the below result,
<html>
<head>
<script>
Challenge=579033;
ChallengeId=232487458;
GenericErrorMessageCookies="Cookies must be enabled in order to view this page.";
</script>
<script>
function test(var1)
{
var var_str=""+Challenge;
var var_arr=var_str.split("");
var LastDig=var_arr.reverse()[0];
var minDig=var_arr.sort()[0];
var subvar1 = (2 * (var_arr[2]))+(var_arr[1]*1);
var subvar2 = (2 * var_arr[2])+var_arr[1];
var my_pow=Math.pow(((var_arr[0]*1)+2),var_arr[1]);
var x=(var1*3+subvar1)*1;
var y=Math.cos(Math.PI*subvar2);
var answer=x*y;
answer-=my_pow*1;
answer+=(minDig*1)-(LastDig*1);
answer=answer+subvar2;
return answer;
}
</script>
<script>
client = null;
if (window.XMLHttpRequest)
{
var client=new XMLHttpRequest();
}
else
{
if (window.ActiveXObject)
{
client = new ActiveXObject('MSXML2.XMLHTTP.3.0');
};
}
if (!((!!client)&&(!!Math.pow)&&(!!Math.cos)&&(!![].sort)&&(!![].reverse)))
{
document.write("Not all needed JavaScript methods are supported.<BR>");
}
else
{
client.onreadystatechange = function()
{
if(client.readyState == 4)
{
var MyCookie=client.getResponseHeader("X-AA-Cookie-Value");
if ((MyCookie == null) || (MyCookie==""))
{
document.write(client.responseText);
return;
}
var cookieName = MyCookie.split('=')[0];
if (document.cookie.indexOf(cookieName)==-1)
{
document.write(GenericErrorMessageCookies);
return;
}
window.location.reload(true);
}
};
y=test(Challenge);
client.open("POST",window.location,true);
client.setRequestHeader('X-AA-Challenge-ID', ChallengeId);
client.setRequestHeader('X-AA-Challenge-Result',y);
client.setRequestHeader('X-AA-Challenge',Challenge);
client.setRequestHeader('Content-Type' , 'text/plain');
client.send();
}
</script>
</head>
<body>
<noscript>
JavaScript must be enabled in order to view this page.
</noscript>
</body>
</html>
Could anyone tell me why it works and crashes?

How to redirect and pass data in angular js with post method

I want to redirect and pass some data to other page.But not using query string and not pass data in URL
(function() {
var app = angular.module('PlateformApp', [])
app.controller('PlateformController', function ($scope,$window) {
//This will hide the DIV by default.
$scope.IsVisible = false;
$scope.ShowHide = function (platform) {
//If DIV is visible it will be hidden and vice versa.
$scope.IsVisible = $scope.IsVisible ? true : true;
//alert(platform);
document.getElementById("platform").value = platform;
var myEl = angular.element( document.querySelector( '#plat_val' ) );
myEl.text(platform);
}
$scope.storeAppWindow = function()
{
//store_url = $scope.storeUrl;
test_bc_url = ""
text_sh_url = ""
platform_val = document.getElementById("platform").value;
$http.get("/user/installedapp")
.then(function(response) {
$scope.myWelcome = response.data;
});
if (platform_val == "BC")
$window.open(test_bc_url, "popup", "width=500,height=400,left=10,top=50");
else if (platform_val == "Sh")
$window.open(text_sh_url, "popup", "width=500,height=400,left=10,top=50");
}
});
})();
Here It will open new window but i want to pass platform_val text_sh_url url in another page.And i am using flask in python.
You can pass the data by various methods, using $stateParams,using Storages, using factories or services. You can find the example using storage in my answer in this thread: sharing data using storages
(function() {
var app = angular.module('PlateformApp', [])
app.factory('newService', function() {
function set(data) {
datatosend = data;
}
function get() {
return datatosend;
}
return {
set: set,
get: get
}
});
app.controller('PlateformController', function($scope, $window, newService) {
//This will hide the DIV by default.
$scope.IsVisible = false;
$scope.ShowHide = function(platform) {
//If DIV is visible it will be hidden and vice versa.
$scope.IsVisible = $scope.IsVisible ? true : true;
//alert(platform);
document.getElementById("platform").value = platform;
var myEl = angular.element(document.querySelector('#plat_val'));
myEl.text(platform);
}
$scope.storeAppWindow = function()
{
//store_url = $scope.storeUrl;
test_bc_url = ""
text_sh_url = ""
platform_val = document.getElementById("platform").value;
$http.get("/user/installedapp")
.then(function(response) {
$scope.myWelcome = response.data;
});
if (platform_val == "BC") {
$window.open(test_bc_url, "popup", "width=500,height=400,left=10,top=50");
}
else if (platform_val == "Sh") {
var data = {
'platform_val': platform_val,
'text_sh_url ': text_sh_url
};
newService.set(data);
$window.open(text_sh_url, "popup", "width=500,height=400,left=10,top=50");
}
}
});
})();
And,in the page where you want to get the data. Just inject newService in the controller of that page and use newService.get(data)
app.controller('pageController',function($scope,newService){
var datafromanothercontroller = newService.get(data);
console.log(datafromanothercontroller );
})

Requests does not return html anymore - Python

I am trying to get a name from a public Linkedin url via python requests (2.7).
The code used to work fine.
import requests
from bs4 import BeautifulSoup
url = "https://www.linkedin.com/in/linustorvalds"
html = requests.get(url).content
link = BeautifulSoup(html).title.text.split("|")[0].replace(" ","")
print link
The desired output is:
linustorvalds
I am getting the following error message:
AttributeError: 'NoneType' object has no attribute 'text'
The issue seems to be that html is not returning the real content of the page. So there is no 'title' found. This is the result of printing html:
<html><head>
<script type="text/javascript">
window.onload = function() {
var newLocation = "";
if (window.location.protocol == "http:") {
var cookies = document.cookie.split("; ");
for (var i = 0; i < cookies.length; ++i) {
if ((cookies[i].indexOf("sl=") == 0) && (cookies[i].length > 3)) {
newLocation = "https:" + window.location.href.substring(window.location.protocol.length);
}
}
}
if (newLocation.length == 0) {
var domain = location.host;
var newDomainIndex = 0;
if (domain.substr(0, 6) == "touch.") {
newDomainIndex = 6;
}
else if (domain.substr(0, 7) == "tablet.") {
newDomainIndex = 7;
}
if (newDomainIndex) {
domain = domain.substr(newDomainIndex);
}
newLocation = "https://" + domain + "/uas/login?trk=sentinel_org_block&session_redirect=" + encodeURIComponent(window.location)
}
window.location.href = newLocation;
}
</script>
</head></html>
Am I being blocked? What are the possible suggestions to make this code work as before?
Thanks a lot!
Try setting a User-Agent header:
html = requests.get(url, headers={"User-Agent": "Requests"}).content

Categories