How to clean up this web scrape - Python

I want to scrape a table from a webpage, but there are two tables with the same tag.
The table I am interested in is "Event Timeline."
My problem is that my code prints the desired table as one block of text and does not separate it by column/row.
Ideally I would want the output broken up per field.
Is there a way to clean this scrape up?
from selenium import webdriver
import pandas as pd
import time

driver = webdriver.Chrome()
val = []
driver.get('https://www.aan.com/MSA/Public/Events/Details/13419')
page_source = driver.page_source
element2 = driver.find_element_by_tag_name('tbody').text.strip()
print(element2)

Selenium is aimed more at web automation, so I will answer your question using the web scraping package BeautifulSoup instead.
This answer obtains the page's HTML using your code, but a more efficient solution would be the requests package.
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd

driver = webdriver.Chrome()
val = []
# Suggest using the requests package to obtain the HTML source code
driver.get('https://www.aan.com/MSA/Public/Events/Details/13419')
page_source = driver.page_source
# element2 = driver.find_element_by_tag_name('tbody')
# Declare a BeautifulSoup object
soup = BeautifulSoup(page_source, 'html.parser')
tbody = soup.find("tbody")    # Find the first tbody
rows = tbody.find_all("tr")   # Find all the rows
for row in rows:
    rowVal = []                # Create a list to store the row's values
    tds = row.find_all("td")   # Find all the cells in the row
    for td in tds:
        rowVal.append(td.get_text().strip())  # Obtain the text of the cell
    print(rowVal)              # Print the row, or do anything else with it
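Since pandas is already imported and val is declared but never used, here is a minimal follow-up sketch, continuing from the loop above, that collects the rows into a DataFrame (no column names are assumed; read them from the table's header row if it has one):

# Collect every row into val, then build a DataFrame from it
for row in rows:
    val.append([td.get_text().strip() for td in row.find_all("td")])
df = pd.DataFrame(val)  # pass columns=[...] once the header names are known
print(df)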

Related

Can't identify a table when scraping

Beginner question here. I'm attempting to scrape data from a table, but I can't seem to locate it; I've tried using both the class and the id to identify it, but my result is 0. The code and output are below.
# Import necessary packages
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re
# Site URL
url="https://fbref.com/en/comps/9/stats/Premier-League-Stats"
# Make a GET request to fetch the raw HTML content
html_content = requests.get(url).text
# Parse HTML code for the entire site
soup = BeautifulSoup(html_content, "lxml")
#print(soup.prettify()) # print the parsed data of html
gdp = soup.find_all("table", attrs={"id": "stats_standard"})
print("Number of tables on site: ",len(gdp))
Output - 'Number of tables on site: 0'
I suggest you use Selenium for this kind of scraping; it is very reliable.
This code should work for you:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
option = Options()
option.add_argument('--headless')
url = 'https://fbref.com/en/comps/9/stats/Premier-League-Stats'
driver = webdriver.Chrome(options=option)
driver.get(url)
bs = BeautifulSoup(driver.page_source, 'html.parser')
gdp = bs.find_all('table', {'id': 'stats_standard'})
driver.quit()
print("Number of tables on site: ",len(gdp))
Output
Number of tables on site: 1
Can you find the table(s) without using attrs={"id": "stats_standard"}?
I have checked and indeed I cannot find any table whose ID is stats_standard (but there is one with ID stats_standard_sh, for example). So I guess you might be using the wrong ID.
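As an alternative worth checking: sports-reference sites like fbref sometimes ship tables inside HTML comments that JavaScript unwraps later, which would explain why the raw HTML shows zero matches. A sketch of how to look inside the comments with plain requests (the stats_standard id is taken from the question and may need adjusting):

import requests
from bs4 import BeautifulSoup, Comment

html = requests.get("https://fbref.com/en/comps/9/stats/Premier-League-Stats").text
soup = BeautifulSoup(html, "lxml")
# Re-parse every HTML comment and search it for the table
for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
    inner = BeautifulSoup(comment, "lxml")
    table = inner.find("table", {"id": "stats_standard"})
    if table is not None:
        print("Found the commented-out table")
        break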

Web scraping on dynamic JS-loaded sites

I am working on scraping the following page: COVID. What I need to do is generate a CSV of the table that appears on the page, but the table is loaded dynamically, which is why I am using Selenium. The problem is that even so, I cannot find the tables with my code, which is the following:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
#url of the page we want to scrape
url = "https://saluddigital.ssch.gob.mx/covid/"
# initiating the webdriver. Parameter includes the path of the webdriver.
driver = webdriver.Firefox()
driver.get(url)
# this is just to ensure that the page is loaded
html = driver.page_source
soup = BeautifulSoup(html, "html.parser")
print(len(soup.find_all("table")))
driver.close()
driver.quit()
When I print the table I get 0 since it cannot find it.
I was also trying to extract the data and generate a CSV file. Hope it helps.
from bs4 import BeautifulSoup
from selenium import webdriver
import time
import csv

url = "https://saluddigital.ssch.gob.mx/covid/"
# initiating the webdriver; the parameter can include the path of the webdriver
driver = webdriver.Chrome()
driver.get(url)
time.sleep(5)  # delay so the page loads properly
html = driver.page_source
soup = BeautifulSoup(html, "html.parser")
table = soup.select_one('div.contenedor-general')
header = [a.getText(strip=True, separator=' ').split() for a in table.find_all('tr', {'class': 'header-table'})]
text1 = [t.text.strip().split() for t in soup.find_all('tr', {'class': 'ringlon-1'})]
text2 = [t.text.strip().split() for t in soup.find_all('tr', {'class': 'ringlon-2'})]
with open('outz.csv', 'w', newline='') as f:
    wr = csv.writer(f, delimiter=',')
    wr.writerow(header[0][1:])
    for row in text1:
        wr.writerow(row)
    for row in text2:
        wr.writerow(row)
It looks like you just need to perform a simple GET to https://saluddigital.ssch.gob.mx/app/asincronos/jsonstats.ashx?getconteos=1 and parse the JSON response.
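A minimal sketch of that approach (the exact shape of the JSON payload isn't shown here, so inspect it before relying on any keys; the pandas step at the end assumes the payload turns out to be a list of records):

import requests
import pandas as pd

url = "https://saluddigital.ssch.gob.mx/app/asincronos/jsonstats.ashx?getconteos=1"
data = requests.get(url).json()
# Inspect the payload first; its structure is an assumption here
print(data)
# If it is a list of records, this one-liner would finish the CSV job:
# pd.DataFrame(data).to_csv("covid.csv", index=False)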

Indexing multiple tables in BeautifulSoup

This is the page I want to parse: https://fbref.com/en/comps/9/gca/Premier-League-Stats
It has 2 tables. I am trying to get information from the second table, but it keeps displaying the first table every time I run this code.
from bs4 import BeautifulSoup
import requests
source = requests.get('https://fbref.com/en/comps/9/gca/Premier-League-Stats').text
soup = BeautifulSoup(source, 'lxml')
stattable = soup.find('table', class_= 'min_width sortable stats_table min_width shade_zero')[1]
print(stattable)
min_width sortable stats_table min_width shade_zero is the class of the 'second' table.
It does not give me an error nor does it return anything. It's null.
Since the second table is dynamically generated, why not combine selenium, BeautifulSoup, and pandas to get what you want?
For example:
import time
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
options = Options()
options.headless = False
driver = webdriver.Chrome(options=options)
driver.get("https://fbref.com/en/comps/9/gca/Premier-League-Stats")
time.sleep(2)
soup = BeautifulSoup(driver.page_source, "html.parser").find("div", {"id": "div_stats_gca"})
driver.close()
df = pd.read_html(str(soup), skiprows=[0, 1])
df = pd.concat(df)
df.to_csv("data.csv", index=False)
This spits out a .csv file that, well, looks like that table you want. :)
The HTML you see when you inspect the element is generated using JavaScript. However, the same classes are not available in the raw HTML that you get with your script.
I disabled JavaScript for this site and saw that the table is not visible.
You can try something like Selenium. There is good information in this question.

How to get data from javascript rendered table using selenium in python

I have a website to scrape and I am using Selenium to do it. When I finished writing the code, I noticed that I was not getting any output when I printed the table contents. I viewed the page source and found that the table was not in it. That is why, even when I use the table's XPath from inspect element, I can't get any output from it. Does someone know how I could get the response/data, or just print the table from the JavaScript response? Thanks.
Here is my current code
from bs4 import BeautifulSoup
from selenium import webdriver
import time
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument('--incognito')
chrome_path = r"C:\chromedriver.exe"
driver = webdriver.Chrome(chrome_path, options=options)
driver.implicitly_wait(3)
url = "https://reversewhois.domaintools.com/?refine#q=%5B%5B%5B%22whois%22%2C%222%22%2C%22VerifiedID%40SG-Mandatory%22%5D%5D%5D"
driver.get(url)
html = driver.page_source
soup = BeautifulSoup(html, 'lxml')
# These lines select the desired search parameters from the combo box; you can
# disregard them since I was putting the whole URL with params
input = driver.find_element_by_xpath('//*[@id="q0"]/div[2]/div/div[1]/div[3]/input')
driver.find_element_by_xpath('//*[@id="q0"]/div[2]/div/div[1]/div[1]/div').click()
driver.find_element_by_xpath('//*[@id="q0"]/div[2]/div/div[1]/div[5]/div[1]/div/div[3]').click()
driver.find_element_by_xpath('//*[@id="q0"]/div[2]/div/div[1]/div[2]/div/div[1]').click()
driver.find_element_by_xpath('//*[@id="q0"]/div[2]/div/div[1]/div[6]/div[1]/div/div[1]').click()
input.send_keys("VerifiedID@SG-Mandatory")
driver.find_element_by_xpath('//*[@id="search-button-container"]/button').click()
table = driver.find_elements_by_xpath('//*[@id="refine-preview-content"]/table/tbody/tr/td')
for i in table:
    print(i)  # no output
I just want to scrape all the domain names like in the first result like 0 _ _ .sg
You can try the code below. After all the detail options have been selected and the search button clicked, it waits so we are sure to get the full page source. Then we use read_html from pandas, which finds every table present in the HTML and returns a list of dataframes; we take the required df from there.
from selenium import webdriver
import time
from selenium.webdriver.chrome.options import Options
import pandas as pd

options = Options()
options.add_argument('--incognito')
chrome_path = r"C:/Users/prakh/Documents/PythonScripts/chromedriver.exe"
driver = webdriver.Chrome(chrome_path, options=options)
driver.implicitly_wait(3)
url = "https://reversewhois.domaintools.com/?refine#q=%5B%5B%5B%22whois%22%2C%222%22%2C%22VerifiedID%40SG-Mandatory%22%5D%5D%5D"
driver.get(url)
#html = driver.page_source
#soup = BeautifulSoup(html,'lxml')
# These lines select the desired search parameters from the combo box
input = driver.find_element_by_xpath('//*[@id="q0"]/div[2]/div/div[1]/div[3]/input')
driver.find_element_by_xpath('//*[@id="q0"]/div[2]/div/div[1]/div[1]/div').click()
driver.find_element_by_xpath('//*[@id="q0"]/div[2]/div/div[1]/div[5]/div[1]/div/div[3]').click()
driver.find_element_by_xpath('//*[@id="q0"]/div[2]/div/div[1]/div[2]/div/div[1]').click()
driver.find_element_by_xpath('//*[@id="q0"]/div[2]/div/div[1]/div[6]/div[1]/div/div[1]').click()
input.send_keys("VerifiedID@SG-Mandatory")
driver.find_element_by_xpath('//*[@id="search-button-container"]/button').click()
time.sleep(5)
html = driver.page_source
tables = pd.read_html(html)
df = tables[-1]
print(df)
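As a side note, the fixed time.sleep(5) could be replaced with an explicit wait so the script continues as soon as the results table exists; a sketch using the same XPath as above:

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

# Wait up to 15 seconds for the results table to be present in the DOM
WebDriverWait(driver, 15).until(
    EC.presence_of_element_located((By.XPATH, '//*[@id="refine-preview-content"]/table'))
)
tables = pd.read_html(driver.page_source)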
If you are open to other approaches, does the following give the expected results? It mimics the XHR the page makes (though I have trimmed it down to the essential elements only) to retrieve the lookup results. Faster than using a browser.
import requests
import pandas as pd

headers = {'User-Agent': 'Mozilla/5.0'}
r = requests.get('https://reversewhois.domaintools.com/?ajax=mReverseWhois&call=ajaxUpdateRefinePreview&q=[[[%22whois%22,%222%22,%22VerifiedID%40SG-Mandatory%22]]]&sf=true', headers=headers)
table = pd.read_html(r.json()['results'])
print(table)

Python BeautifulSoup - trouble parsing table from webpage

I'd like to parse the table data from the following site:
Pricing data, and create a dataframe with all of the table values (vCPU, Memory, Storage, Price). However, with the following code, I can't seem to find the table on the page. Can someone help me figure out how to parse out the values?
Using pd.read_html, an error shows up saying that no tables are found.
import pandas as pd
from bs4 import BeautifulSoup
import requests
import csv
url = "https://aws.amazon.com/ec2/pricing/on-demand/"
r = requests.get(url)
html_content = r.text
soup = BeautifulSoup(html_content, 'html.parser')
data=[]
tables = soup.find_all('table')
df = pd.read_html(url)
If you're having trouble because of dynamic content, a good workaround is Selenium: it simulates the browser experience, so you don't have to worry about managing cookies and other problems that come with dynamic web content. I was able to scrape the page with the following:
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from time import sleep
driver = webdriver.Firefox()
driver.get('https://aws.amazon.com/ec2/pricing/on-demand/')
sleep(3)
html = driver.page_source
soup = BeautifulSoup(html,'lxml')
driver.close()
data=[]
tables = soup.find_all('table')
print(tables)
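Since the end goal is a dataframe rather than raw tags, the same rendered HTML can be handed straight to pandas; a minimal sketch continuing from the snippet above:

# read_html parses every <table> in the rendered page into a DataFrame;
# look through the list for the pricing table (vCPU, Memory, Storage, Price)
dfs = pd.read_html(html)
print(len(dfs))
print(dfs[0].head())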
