How to get data from javascript rendered table using selenium in python

How to get data from javascript rendered table using selenium in python - python

I have a website to scrape and i am using selenium to do it. When i finished writing the code, i noticed that i was not getting output at all when i print the table contents. I viewed the page source and then i found out that the table was not in the source. That is why even i find the xpath of the table from inspect element i cant get any output of it. Do someone know how could I get the response/data or just printing the table from the javascript response? Thanks.
Here is my current code
from bs4 import BeautifulSoup
from selenium import webdriver
import time
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
options = Options()
options.add_argument('--incognito')
chrome_path = r"C:\chromedriver.exe"
driver = webdriver.Chrome(chrome_path,options=options)
driver.implicitly_wait(3)
url = "https://reversewhois.domaintools.com/?refine#q=
%5B%5B%5B%22whois%22%2C%222%22%2C%22VerifiedID%40SG-Mandatory%22%5D%5D%5D"
driver.get(url)
html = driver.page_source
soup = BeautifulSoup(html,'lxml')
#These line of codes is for selecting the desired search parameter from the combo box, you can disregard this since i was putting the whole url with params
input = driver.find_element_by_xpath('//*[#id="q0"]/div[2]/div/div[1]/div[3]/input')
driver.find_element_by_xpath('//*[#id="q0"]/div[2]/div/div[1]/div[1]/div').click()
driver.find_element_by_xpath('//*[#id="q0"]/div[2]/div/div[1]/div[5]/div[1]/div/div[3]').click()
driver.find_element_by_xpath('//*[#id="q0"]/div[2]/div/div[1]/div[2]/div/div[1]').click()
driver.find_element_by_xpath('//*[#id="q0"]/div[2]/div/div[1]/div[6]/div[1]/div/div[1]').click
input.send_keys("VerifiedID#SG-Mandatory")
driver.find_element_by_xpath('//*[#id="search-button-container"]/button').click()
table = driver.find_elements_by_xpath('//*[#id="refine-preview-content"]/table/tbody/tr/td')
for i in table:
print(i) no output
I just want to scrape all the domain names like in the first result like 0 _ _ .sg

You can try the below code. After you have selected all the details options to fill and click on the search button it is kind of an implicit wait to make sure we get the full page source. Then we used the read_html from pandas which searches for any tables present in the html and returns a list of dataframe. we take the required df from there.
from selenium import webdriver
import time
from selenium.webdriver.chrome.options import Options
import pandas as pd
options = Options()
options.add_argument('--incognito')
chrome_path = r"C:/Users/prakh/Documents/PythonScripts/chromedriver.exe"
driver = webdriver.Chrome(chrome_path,options=options)
driver.implicitly_wait(3)
url = "https://reversewhois.domaintools.com/?refine#q=%5B%5B%5B%22whois%22%2C%222%22%2C%22VerifiedID%40SG-Mandatory%22%5D%5D%5D"
driver.get(url)
#html = driver.page_source
#soup = BeautifulSoup(html,'lxml')
#These line of codes is for selecting the desired search parameter from the combo box
input = driver.find_element_by_xpath('//*[#id="q0"]/div[2]/div/div[1]/div[3]/input')
driver.find_element_by_xpath('//*[#id="q0"]/div[2]/div/div[1]/div[1]/div').click()
driver.find_element_by_xpath('//*[#id="q0"]/div[2]/div/div[1]/div[5]/div[1]/div/div[3]').click()
driver.find_element_by_xpath('//*[#id="q0"]/div[2]/div/div[1]/div[2]/div/div[1]').click()
driver.find_element_by_xpath('//*[#id="q0"]/div[2]/div/div[1]/div[6]/div[1]/div/div[1]').click
input.send_keys("VerifiedID#SG-Mandatory")
driver.find_element_by_xpath('//*[#id="search-button-container"]/button').click()
time.sleep(5)
html = driver.page_source
tables = pd.read_html(html)
df = tables[-1]
print(df)

If you are open to other ways does the following give the expected results? It mimics the xhr the page does (though I have trimmed it down to essential elements only) to retrieve the lookup results. Faster than using a browser.
from bs4 import BeautifulSoup as bs
import requests
import pandas as pd
headers = {'User-Agent': 'Mozilla/5.0'}
r = requests.get('https://reversewhois.domaintools.com/?ajax=mReverseWhois&call=ajaxUpdateRefinePreview&q=[[[%22whois%22,%222%22,%22VerifiedID#SG-Mandatory%22]]]&sf=true', headers=headers)
table = pd.read_html(r.json()['results'])
print(table)

Related

Unable to scrape text using Selenium and BeautifulSoup

I am trying to automate acquiring data for a research project from Morningstar using Selenium and BeautifulSoup in Python. I am quite new in Python so I have just tried a bunch of solutions from Stackoverflow and similar fora, but I have been unsuccessful.
What I am trying to scrape is on the url https://www.morningstar.dk/dk/funds/snapshot/snapshot.aspx?id=F000014CU8&tab=3
In the url, I am specifically looking for the "Faktorprofil" for which you can click to show the data as a table. I can get the headings from the url, but I am unable to soup.find any other text. I have tried using multiple ids and classes, but without any luck. The code I believe I have been the most successful with is written below.
I hope someone can help!
from bs4 import BeautifulSoup
import os
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
opts = Options()
opts.add_argument(" --headless")
chrome_driver = os.getcwd() +"/chromedriver"
driver = webdriver.Chrome(options=opts, executable_path=chrome_driver)
driver.get("https://www.morningstar.dk/dk/funds/snapshot/snapshot.aspx?id=F00000ZG2E&tab=3")
soup_file=driver.page_source
soup = BeautifulSoup(soup_file, 'html.parser')
print(soup.title.get_text())
#print(soup.find(class_='').get_text())
#print(soup.find(id='').get_text())
This is the data I want to scrape
[1]: https://i.stack.imgur.com/wkSMj.png

All those tables are in an iframe. Below code will retrieve the data and prints as a list.
driver.implicitly_wait(10)
driver.get("https://www.morningstar.dk/dk/funds/snapshot/snapshot.aspx?id=F000014CU8&tab=3")
driver.switch_to.frame(1)
driver.find_element_by_xpath("//button[contains(#class,'show-table')]//span").click()
table = driver.find_elements_by_xpath("//div[contains(#class,'sal-mip-factor-profile__value-table')]/table//tr/th")
header = []
for tab in table:
header.append(tab.text)
print(header)
tablebody = driver.find_elements_by_xpath("//div[contains(#class,'sal-mip-factor-profile__value-table')]/table//tbody/tr")
for tab in tablebody:
data = []
content = tab.find_elements_by_tag_name("td")
for con in content:
data.append(con.text)
print(data)
Output:
['Faktorer', 'Fonds værdi', '5 år Min. Værdi', '5 år Maks værdi', 'Kategori Gennemsnitlig']
['Stil', '62,33', '31,52', '76,36', '48,20']
['Effektiv rente', '48,83', '20,82', '69,12', '34,74']
['Momentum', '58,47', '7,48', '77,21', '71,15']
['Kvalitet', '25,65', '21,61', '59,66', '38,15']
['Volatilitet', '45,25', '34,66', '81,08', '74,93']
['Likviditet', '35,70', '33,40', '74,94', '79,39']
['Størrelse', '39,60', '35,67', '48,78', '87,59']

Can't identify a table when scraping

Beginner question.. I'm attempting to scrape data from a table but I can't seem to recognize it, I've tried using the class and the id to identify it but my result is 0. The code and output are below.
# Import necessary packages
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re
# Site URL
url="https://fbref.com/en/comps/9/stats/Premier-League-Stats"
# Make a GET request to fetch the raw HTML content
html_content = requests.get(url).text
# Parse HTML code for the entire site
soup = BeautifulSoup(html_content, "lxml")
#print(soup.prettify()) # print the parsed data of html
gdp = soup.find_all("table", attrs={"id": "stats_standard"})
print("Number of tables on site: ",len(gdp))
Output - 'Number of tables on site: 0'

I suggest you to use selenium for such scraping, its performance is very reliable.
This code will work for you:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
option = Options()
option.add_argument('--headless')
url = 'https://fbref.com/en/comps/9/stats/Premier-League-Stats'
driver = webdriver.Chrome(options=option)
driver.get(url)
bs = BeautifulSoup(driver.page_source, 'html.parser')
gdp = bs.find_all('table', {'id': 'stats_standard'})
driver.quit()
print("Number of tables on site: ",len(gdp))
Output
Number of tables on site: 1

Can you find the table(s) without using attrs={"id": "stats_standard"}?
I have checked and indeed I cannot find any table whose ID is stats_standard (but there is one with ID stats_standard_sh, for example). So I guess you might be using the wrong ID.

Web Scraping on Dinamica JS loaded sites

I am doing a web scraping job of the following page: COVID, what I need to do is generate a csv of the table that appears on the page but is dynamically loaded with data for which I am using selenium. The problem is that even so I cannot find the tables with the code which is the following:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
#url of the page we want to scrape
url = "https://saluddigital.ssch.gob.mx/covid/"
# initiating the webdriver. Parameter includes the path of the webdriver.
driver = webdriver.Firefox()
driver.get(url)
# this is just to ensure that the page is loaded
html = driver.page_source
soup = BeautifulSoup(html, "html.parser")
print(len(soup.find_all("table")))
driver.close()
driver.quit()
When I print the table I get 0 since it cannot find it.

I am also trying to extract and generate csv file with data. Hopes it help.
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
import csv
url = "https://saluddigital.ssch.gob.mx/covid/"
# initiating the webdriver. Parameter includes the path of the webdriver.
driver = webdriver.Chrome()
driver.get(url)
time.sleep(5) # delay for load properly
# # this is just to ensure that the page is loaded
html = driver.page_source
soup = BeautifulSoup(html, "html.parser")
table = soup.select_one('div.contenedor-general')
header = [[a.getText(strip=True,separator=' ')][0].split() for a in table.find_all('tr', {'class': 'header-table'})]
text1 = [t.text.strip().split() for t in soup.find_all('tr', {'class': 'ringlon-1'})]
text2 = [t.text.strip().split() for t in soup.find_all('tr', {'class': 'ringlon-2'})]
with open('outz.csv', 'w') as f:
wr = csv.writer(f, delimiter=',')
wr.writerow(header[0][1:])
for row in text1:
wr.writerow(row)
for row in text2:
wr.writerow(row)

It looks like you need to perform a simple GET to https://saluddigital.ssch.gob.mx/app/asincronos/jsonstats.ashx?getconteos=1 and parse JSON response.

Indexing multiple tables in BeautfulSoup

This page I want to parse - https://fbref.com/en/comps/9/gca/Premier-League-Stats
It has 2 tables, I am trying to get information from the second table, but it keeps displaying the first table every time I run this code.
from bs4 import BeautifulSoup
import requests
source = requests.get('https://fbref.com/en/comps/9/gca/Premier-League-Stats').text
soup = BeautifulSoup(source, 'lxml')
stattable = soup.find('table', class_= 'min_width sortable stats_table min_width shade_zero')[1]
print(stattable)
min_width sortable stats_table min_width shade_zero is the ID of the 'second' table.
It does not give me an error nor does it return anything. It's null.

Since the second table is dynamically generated, why not combine selenium, BeautifulSoup, and pandas to get what you want?
For example:
import time
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
options = Options()
options.headless = False
driver = webdriver.Chrome(options=options)
driver.get("https://fbref.com/en/comps/9/gca/Premier-League-Stats")
time.sleep(2)
soup = BeautifulSoup(driver.page_source, "html.parser").find("div", {"id": "div_stats_gca"})
driver.close()
df = pd.read_html(str(soup), skiprows=[0, 1])
df = pd.concat(df)
df.to_csv("data.csv", index=False)
This spits out a .csv file that, well, looks like that table you want. :)

The HTML you see when you do inspect element are generated using Javascript. However, the same classes are not available in the raw html that you get using the script.
I disabled Javascript for this site and I saw that the table is not visible.
You can try something like Selenium. There is good information in this question.

Print elements using dt class name selenium python

I am trying to write a simple scraper for Sales Navigator in Linkedin and this is the link I am trying to scrape . It has search results for specific filter options selected for account results.
The goal I am trying to achieve is to retrieve every company name among the search results. Upon inspecting the link elements carrying the company name (eg : Facile.it, AGT international), I see the following js script, showing the dt class name
<dt class="result-lockup__name">
<a id="ember208" href="/sales/company/2429831?_ntb=zreYu57eQo%2BSZiFskdWJqg%3D%3D" class="ember-view"> Facile.it
</a> </dt>
I basically want to retrieve those names and open the url represented in href.
It can be noted that all the company name links had the same dt class result-lockup__name. The following script is an attempt to collect the list of all company names displayed in the search result along with its elements.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import re
import pandas as pd
import os
def scrape_accounts(url):
url = "https://www.linkedin.com/sales/search/companycompanySize=E&geoIncluded=emea%3A0%2Ceurope%3A0&industryIncluded=6&keywords=AI&page=1&searchSessionId=zreYu57eQo%2BSZiFskdWJqg%3D%3D"
driver = webdriver.PhantomJS(executable_path='C:\\phantomjs\\bin\\phantomjs.exe')
#driver = webdriver.Firefox()
#driver.implicitly_wait(30)
driver.get(url)
search_results = []
search_results = driver.find_elements_by_class_name("result-lockup__name")
print(search_results)
if __name__ == "__main__":
scrape_accounts("lol")
however, the result prints an empty list. I am trying to learn scraping different parts of web page and different elements,and thus I am not sure if I got this correct. What would be the right way?

I'm afraid I can't get to the page that you're after, but I notice that you're importing beautiful soup but not using it.
Try:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import re
import pandas as pd
import os
url = "https://www.linkedin.com/sales/search/companycompanySize=E&geoIncluded=emea%3A0%2Ceurope%3A0&industryIncluded=6&keywords=AI&page=1&searchSessionId=zreYu57eQo%2BSZiFskdWJqg%3D%3D"
def scrape_accounts(url = url):
driver = webdriver.PhantomJS(executable_path='C:\\phantomjs\\bin\\phantomjs.exe')
#driver = webdriver.Firefox()
#driver.implicitly_wait(30)
driver.get(url)
html = driver.find_element_by_tag_name('html').get_attribute('innerHTML')
soup = BeautifulSoup(html, 'html.parser')
search_results = soup.select('dt.result-lockup__name a')
for link in search_results:
print(link.text.strip(), link['href'])

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

How to get data from javascript rendered table using selenium in python - python

Related

Unable to scrape text using Selenium and BeautifulSoup

Can't identify a table when scraping

Web Scraping on Dinamica JS loaded sites

Indexing multiple tables in BeautfulSoup

Print elements using dt class name selenium python

Categories

Resources