How to get a table into a DataFrame with BeautifulSoup - python

It seems that Beautiful Soup is not able to retrieve the info from a table.
What I am trying to do is retrieve the table, with its header, and save it to a DataFrame in pandas. Any help is much appreciated.
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Create an URL object
url = 'xxxx'
# Create object page
page = requests.get(url)
soup = BeautifulSoup(page.content, "html5lib")

data = soup.find_all("table", id="cve_table", attrs={"class": "table"})
print(len(data))

headers = []
for body in data:
    print(body)
    for item in body:
        title = item.text
        print(title)
        headers.append(title)
print(headers)
All I got is this:
<table class="table cell-border table-striped table-condensed table-hover" id="cve_table">
<tbody></tbody>
</table>
['\n ', '', '\n\n ']

It seems that the table is rendered by JavaScript, so when requests loads the HTML page, the table is empty. Looking at the page source, you can see that the table is rendered by the function called in the last script element. This function takes as a parameter the data structure needed to render the table, which can be extracted as follows:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import json
from collections import defaultdict

# Create an URL object
url = 'https://cve.rayvyn.net/rayvyn'
# Create object page
page = requests.get(url)
soup = BeautifulSoup(page.content, "html5lib")

dct = defaultdict(list)
script = soup.find(lambda tag: tag.name == "script" and "get_all_cve_data" in tag.text)
if script:
    result = re.search(r'\((.*)\)', script.text)
    text = result.group(1)
    data = json.loads(json.loads(text))
    for row in data:
        dct['CVE ID'].append(row[0])
        dct['Feed'].append(row[1])
        dct['Date Modified'].append(row[2])
        dct['Description'].append(row[3])
        dct['Vector'].append(row[4])
        dct['Vendor'].append(row[5])
        dct['Product'].append(row[6])
        dct['Advisory Link'].append(row[7])
else:
    print('Script tag with function get_all_cve_data() not found')

df = pd.DataFrame(dct)
df
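As a side note, json.loads is applied twice because the argument passed to the rendering function is a JSON string whose decoded value is itself a JSON document. A tiny illustration of the same pattern (the CVE values here are made up for the example):
import json

# build a doubly-encoded value: a JSON string whose content is again JSON
wrapped = json.dumps(json.dumps([["CVE-2021-0001", "nvd"]]))
inner = json.loads(wrapped)   # first pass: recover the inner JSON text
data = json.loads(inner)      # second pass: recover the actual list of rows
print(data[0][0])             # -> CVE-2021-0001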
An alternative approach would be to use the Selenium framework:
import re
import json
import pandas as pd
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from collections import defaultdict

# Create an URL object
url = 'https://cve.rayvyn.net/rayvyn'
# delay for selenium web driver wait
DELAY = 30

# create selenium driver
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
driver = webdriver.Chrome('<<path to chromedriver.exe>>', options=chrome_options)

# open web page
driver.get(url)
script = WebDriverWait(driver, DELAY).until(EC.presence_of_element_located((By.XPATH, "//script[contains(text(), 'get_all_cve_data')]")))

dct = defaultdict(list)
if script:
    result = re.search(r'\((.*)\)', script.get_attribute('innerHTML'))
    text = result.group(1)
    data = json.loads(text)
    print(data)
else:
    print('Script tag with function get_all_cve_data() not found')

driver.quit()

data = json.loads(json.loads(text))
for row in data:
    dct['CVE ID'].append(row[0])
    dct['Feed'].append(row[1])
    dct['Date Modified'].append(row[2])
    dct['Description'].append(row[3])
    dct['Vector'].append(row[4])
    dct['Vendor'].append(row[5])
    dct['Product'].append(row[6])
    dct['Advisory Link'].append(row[7])

df = pd.DataFrame(dct)
df
Please note that to use Selenium you will also need a Selenium webdriver (a separate executable). It simulates browser behavior and (among other features) waits for the JavaScript code on the page to execute and the HTML to be rendered.
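As a side note, once Selenium has rendered the page, parsing the script tag may not be necessary at all: pandas can usually read a populated HTML table straight into a DataFrame. A minimal sketch, assuming the rendered page fills #cve_table with rows (the CSS selector used for the wait is an assumption, not verified against the site):
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
driver = webdriver.Chrome('<<path to chromedriver.exe>>', options=chrome_options)
driver.get('https://cve.rayvyn.net/rayvyn')
# wait until the table body actually contains rendered rows
WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.CSS_SELECTOR, "#cve_table tbody tr")))
# pandas parses the now-populated table directly (requires lxml or html5lib)
df = pd.read_html(driver.page_source, attrs={"id": "cve_table"})[0]
driver.quit()
print(df.head())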

Related

Scraping the web with Python, Scrapy and BeautifulSoup? [duplicate]

I've tried to create a web scraper for CNN. My goal is to scrape all news articles within the search query. Sometimes I get output for some of the scraped pages and sometimes it doesn't work at all.
I am using the selenium and BeautifulSoup packages in a Jupyter Notebook. I am iterating over the pages via the URL parameters &page={}&from={}. I tried By.XPATH before, simply clicking the next button at the end of the page, but it gave me the same results.
Here's the code I'm using:
#0 ------------import libraries
import requests
from bs4 import BeautifulSoup
from bs4.element import Tag
import feedparser
import urllib
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pickle
import pandas as pd

#3 ------------CNN SCRAPER
#3.1 ----------Define Function
def CNN_Scraper(max_pages):
    base = "https://edition.cnn.com/"
    browser = webdriver.Chrome('C:/chromedriver_win32/chromedriver.exe')
    load_content = browser.implicitly_wait(30)
    base_url = 'https://edition.cnn.com/search?q=coronavirus&sort=newest&category=business,us,politics,world,opinion,health&size=100'

    #-------------Define empty lists to be scraped
    CNN_title = []
    CNN_date = []
    CNN_article = []
    article_count = 0

    #-------------iterate over pages and extract
    for page in range(1, max_pages + 1):
        print("Page %d" % page)
        url = base_url + "&page=%d&from=%d" % (page, article_count)
        browser.get(url)
        load_content
        soup = BeautifulSoup(browser.page_source, 'lxml')
        search_results = soup.find('div', {'class': 'cnn-search__results-list'})
        contents = search_results.find_all('div', {'class': 'cnn-search__result-contents'})
        for content in contents:
            try:
                title = content.find('h3').text
                print(title)
                link = content.find('a')
                link_url = link['href']
                date = content.find('div', {'class': 'cnn-search__result-publish-date'}).text.strip()
                article = content.find('div', {'class': 'cnn-search__result-body'}).text
            except:
                print("loser")
                continue
            CNN_title.append(title)
            CNN_date.append(date)
            CNN_article.append(article)
        article_count += 100
        print("-----")

    #-------------Save in DF
    df = pd.DataFrame()
    df['title'] = CNN_title
    df['date'] = CNN_date
    df['article'] = CNN_article
    df['link'] = CNN_link
    return df
    #print("Complete")
    browser.quit()

#3.2 ----------Call Function - Scrape CNN and save pickled data
CNN_data = CNN_Scraper(2)
#CNN_data.to_pickle("CNN_data")
Call the back-end API directly. For more details, check my previous answer:
import requests
import json

def main(url):
    with requests.Session() as req:
        for item in range(1, 1000, 100):
            r = req.get(url.format(item)).json()
            for a in r['result']:
                print("Headline: {}, Url: {}".format(a['headline'], a['url']))

main("https://search.api.cnn.io/content?q=coronavirus&sort=newest&category=business,us,politics,world,opinion,health&size=100&from={}")

Having an issue while scraping the print preview page using selenium webdriver in python

I am having an issue scraping the print preview page and exporting it to a CSV file. When clicking on the Print Preview URL, instead of providing only the text data it also includes the page source code.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
import urllib.request
from bs4 import BeautifulSoup
import csv
import os
from selenium import webdriver
from selenium.webdriver.support.select import Select
from selenium.webdriver.common.keys import Keys
import time
import pandas as pd
import os
from openpyxl.workbook import Workbook
from pandas import ExcelWriter

url = 'https://maharerait.mahaonline.gov.in'
chrome_path = r'C:/Users/User/AppData/Local/Programs/Python/Python36/Scripts/chromedriver.exe'

driver = webdriver.Chrome(executable_path=chrome_path)
driver.get(url)

WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, "//div[@class='search-pro-details']//a[contains(.,'Search Project Details')]"))).click()
Registered_Project_radio = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID, "Promoter")))
driver.execute_script("arguments[0].click();", Registered_Project_radio)
Application = driver.find_element_by_id("CertiNo")
Application.send_keys("P50500000005")
Search = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.ID, "btnSearch")))
driver.execute_script("arguments[0].click();", Search)
View = [item.get_attribute('href') for item in driver.find_elements_by_tag_name("a")]
print(View)
print(View)
driver.get(View)
request = urllib.request.Request(View)
html = urllib.request.urlopen(request).read()
soup = BeautifulSoup(html, 'html.parser')
print(soup.text)
View = [item.get_attribute('href') for item in driver.find_elements_by_tag_name("a")]
is a list, e.g. ['https://maharerait.mahaonline.gov.in/PrintPreview/PrintPreview?q=MB0agrub1IW1Z63O5lldJdHpk6le6np5EB3HZ3jy8r7qPsLpYPdQwJzwE0S5LXG3fqQe%2fUe6HTGYmXstD%2bcYtATvmObra1R4xBa7L235mdTlmH0wHJPnps0ZXvbDMZxA0Hf9fxpOLM%2ba3Ad13hq9M1bp8Agvb%2bCLA3KOgpoYwr0%3d', None, None, None, None],
which contains a URL and None elements.
Replace the code after the driver.execute_script("arguments[0].click();", Search) statement with the code below:
View = [item.get_attribute('href') for item in driver.find_elements_by_tag_name("a") if
        item.get_attribute('href') is not None]
for url in View:
    request = urllib.request.Request(url)
    html = urllib.request.urlopen(request).read()
    soup = BeautifulSoup(html, 'html.parser')
    divPInfo = soup.find("div", {"id": "DivPInfo"})
    title = divPInfo.find("div", {'class': 'x_panel'}, recursive=False).find("div", {'class': 'x_title'}).find("h2").text.strip()
    x_contentObject = divPInfo.find("div", {'class': 'x_panel'}, recursive=False).find_all("div", {'class': 'col-md-3'})
    my_dict = {title: {x_contentObject[0].text.strip(): x_contentObject[1].text.strip()}}
    print(my_dict)
Output:
{'General Information': {'Information Type': 'Other Than Individual'}}
driver.get(View)
div_2 = driver.find_element_by_xpath("//div[@id='fldFirm']").text
print(div_2)
table = pd.read_html(driver.page_source)
#print(table)
#df.to_csv("trial.csv", sep=',', index=False)
div_3 = driver.find_element_by_xpath("//div[@id='DivProject']").text
print(div_3)
file2 = open("MyFile2.txt", "w")
file2.writelines(div_3)
With reference to the code above, I want to scrape the second heading (FSI Details) from the div with id = DivProject, but I am unable to get that second heading.
divPInfo2 = soup.find("div", {"id": "DivProject"})
Project_title1 = divPInfo2.find("div", {'class': 'x_panel'}, recursive=False).find("div", {'class': 'x_title'}).find("h2")[1].get_text(strip=True)
print(Project_title1)
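For what it is worth, find("h2") returns a single Tag, so it cannot be indexed with [1]. A sketch of one way to reach the second heading, assuming DivProject contains several x_title blocks and the FSI Details heading sits in the second one (the class names and their order are taken from the snippets above, not verified):
divPInfo2 = soup.find("div", {"id": "DivProject"})
x_titles = divPInfo2.find_all("div", {"class": "x_title"})  # find_all returns a list that can be indexed
if len(x_titles) > 1:
    Project_title1 = x_titles[1].find("h2").get_text(strip=True)
    print(Project_title1)  # expected to print "FSI Details" if the assumption holds
else:
    print("Second x_title block not found")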

HTML parser pulling from previous webpage

I have a script that loads a page and saves a bunch of data-ids from multiple containers. I then want to open new URLs, appending those data-ids onto the end of the URLs. For each URL I want to locate all the hrefs and compare them to a list of specific links, and if any of them match I want to save that link and a few other details to a table.
I have managed to get it to open the URL with the appended data-id, but when I try to search for elements on the new page, it either pulls them from the first URL that was parsed (if I try to findAll from soup again), or I constantly get this error when I try to run another html.parser:
ResultSet object has no attribute 'findAll'. You're probably treating a list of items like a single item. Did you call find_all() when you meant to call find()?
Is it not possible to run another parser or am I just doing something wrong?
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup as soup
from selenium.webdriver.common.action_chains import ActionChains

url = "http://csgo.exchange/id/76561197999004010#x"
driver = webdriver.Firefox()
driver.get(url)
import time
time.sleep(15)
html = driver.page_source
soup = soup(html, "html.parser")
containers = soup.findAll("div", {"class": "vItem"})
print(len(containers))
data_ids = []  # Make a list to hold the data-id's
for container in containers:
    test = container.attrs["data-id"]
    data_ids.append(test)  # add data-id's to the list
    print(str(test))

for id in data_ids:
    url2 = "http://csgo.exchange/item/" + id
    driver.get(url2)
    import time
    time.sleep(2)
    soup2 = soup(html, "html.parser")
    containers2 = soup2.findAll("div", {"class": "bar"})
    print(str(containers2))

with open('scraped.txt', 'w', encoding="utf-8") as file:
    for id in data_ids:
        file.write(str(id) + '\n')  # write every data-id to a new line
Not sure exactly what you want from each page. You should add waits. I added waits looking for hrefs in the flow history section of each page (if present); it should illustrate the idea.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

url = 'http://csgo.exchange/id/76561197999004010'
driver = webdriver.Chrome()
driver.get(url)
ids = [item.get_attribute('data-id') for item in WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "[data-id]")))]
results = []
baseURL = 'http://csgo.exchange/item/'
for id in ids:
    url = baseURL + id
    driver.get(url)
    try:
        flowHistory = [item.get_attribute('href') for item in WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#tab-history-flow [href]")))]
        results.append([id, flowHistory])
    except:
        print(url)
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

url = 'http://csgo.exchange/id/76561197999004010'
profile = webdriver.FirefoxProfile()
profile.set_preference("permissions.default.image", 2)  # Block all images to load websites faster.
driver = webdriver.Firefox(firefox_profile=profile)
driver.get(url)
ids = [item.get_attribute('data-id') for item in WebDriverWait(driver, 30).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "[data-id]")))]
results = []
baseURL = 'http://csgo.exchange/item/'
for id in ids:
    url = baseURL + id
    driver.get(url)
    try:
        pros = ['http://csgo.exchange/profiles/76561198149324950']
        flowHistory = [item.get_attribute('href') for item in WebDriverWait(driver, 3).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#tab-history-flow [href]")))]
        if flowHistory in pros:
            results.append([url, flowHistory])
            print(results)
    except:
        print()
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

urls = ['http://csgo.exchange/id/76561197999004010']
profile = webdriver.FirefoxProfile()
profile.set_preference("permissions.default.image", 2)  # Block all images to load websites faster.
driver = webdriver.Firefox(firefox_profile=profile)
for url in urls:
    driver.get(url)
    ids = [item.get_attribute('data-id') for item in WebDriverWait(driver, 30).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "[data-id]")))]
    results = []
    pros = ['http://csgo.exchange/profiles/76561198149324950', 'http://csgo.exchange/profiles/76561198152970370']
    baseURL = 'http://csgo.exchange/item/'
    for id in ids:
        url = baseURL + id
        driver.get(url)
        try:
            flowHistory = [item.get_attribute('href') for item in WebDriverWait(driver, 2).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#tab-history-flow [href]")))]
            match = []
            for string in pros:
                if string in flowHistory:
                    match = string
                    break
            if match:
                results.append([url, match])
                print(results)
        except:
            print()
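If the matched links should end up in a table rather than printed lists, the collected results can be written out afterwards. A small follow-up sketch, assuming results holds [url, match] pairs as built in the last variant:
import csv

# write the [url, match] pairs collected above to a CSV file
with open('matches.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['item_url', 'matched_profile'])
    writer.writerows(results)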

How to scrape review data hidden behind "Read more" in Flipkart reviews

I am trying to scrape Flipkart to extract reviews for a product using the requests and BeautifulSoup packages. How can I take out the data hidden behind the "Read more" click event in those reviews?
from selenium import webdriver
from selenium.webdriver.common.by import By
from contextlib import closing
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver import Firefox
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
import time
import urllib2
import re
from bs4 import BeautifulSoup
import unicodedata

def remove_non_ascii_1(text):
    return ''.join([i if ord(i) < 128 else ' ' for i in text])

with closing(Firefox()) as browser:
    site = "https://www.flipkart.com/asus-zenfone-2-laser-ze550kl-black-16-gb/product-reviews/itme9j58yzyzqzgc?pid=MOBE9J587QGMXBB7"
    browser.get(site)
    file = open("review.txt", "w")
    for count in range(1, 10):
        nav_btns = browser.find_elements_by_class_name('_33m_Yg')
        button = ""
        for btn in nav_btns:
            number = int(btn.text)
            if number == count:
                button = btn
                break
        button.send_keys(Keys.RETURN)
        WebDriverWait(browser, timeout=10).until(EC.presence_of_all_elements_located((By.CLASS_NAME, "_2xg6Ul")))
        read_more_btns = browser.find_elements_by_class_name('_1EPkIx')
        for rm in read_more_btns:
            browser.execute_script("return arguments[0].scrollIntoView();", rm)
            browser.execute_script("window.scrollBy(0, -150);")
            rm.click()
        page_source = browser.page_source
        soup = BeautifulSoup(page_source, "lxml")
        ans = soup.find_all("div", class_="_3DCdKt")
        for tag in ans:
            title = unicode(tag.find("p", class_="_2xg6Ul").string).replace(u"\u2018", "'").replace(u"\u2019", "'")
            title = remove_non_ascii_1(title)
            title.encode('ascii', 'ignore')
            content = tag.find("div", class_="qwjRop").div.prettify().replace(u"\u2018", "'").replace(u"\u2019", "'")
            content = remove_non_ascii_1(content)
            content.encode('ascii', 'ignore')
            content = content[15:-7]
            votes = tag.find_all("span", class_="_1_BQL8")
            upvotes = int(votes[0].string)
            downvotes = int(votes[1].string)
            file.write("Review Title : %s\n\n" % title)
            file.write("Upvotes : " + str(upvotes) + "\n\nDownvotes : " + str(downvotes) + "\n\n")
            file.write("Review Content :\n%s\n\n\n\n" % content)
    file.close()
Usage:
Install the requirements by running pip install bs4 selenium.
Add geckodriver to the PATH. Follow these instructions.
Put the link of the product in the site variable inside the script.
Run the script by running python scrape.py.
Reviews will be saved in the file review.txt.
Had some issues using @CSMaverick's code while accessing the READ MORE link. Modified the code as per my requirement.
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup as bs

def get_source_code(browser):
    rm_btns = browser.find_elements_by_class_name('_1BWGvX')
    for rm_btn in rm_btns:
        rm_btn.click()
    return browser.page_source

def collect_reviews_attributes(html):
    soup_obj = bs(html, "html.parser")
    text_tag_divs = soup_obj.find_all('div', attrs={"class", "t-ZTKy"})
    heading_tag_divs = soup_obj.find_all('p', attrs={"class", "_2-N8zT"})
    rating_tag_divs = soup_obj.find_all('div', attrs={"class", "_3LWZlK _1BLPMq"})
    text_tags = [tag.text for tag in text_tag_divs]
    heading_tags = [tag.text for tag in heading_tag_divs]
    rating_tags = [tag.text for tag in rating_tag_divs]
    return list(zip(heading_tags, text_tags, rating_tags))

collector_list = []
browser = webdriver.Firefox(executable_path=r"path to\geckodriver.exe")
url = "https://www.flipkart.com/samsung-253-l-frost-free-double-door-3-star-convertible-refrigerator/product-reviews/itmf75fa1554bad3?pid=RFRFNDEEJ28SNQPG&lid=LSTRFRFNDEEJ28SNQPGEJ3YHJ&sortOrder=MOST_HELPFUL&certifiedBuyer=false&aid=overall"
num_pages = 3  # get from the url dynamically, or else give a large number and keep going until you hit an exception
browser.get(url)  # open the url in the browser
for _ in range(num_pages):
    page_source_code = get_source_code(browser)
    collector_list.extend(collect_reviews_attributes(page_source_code))
    next_page = browser.find_elements_by_class_name('_1LKTO3')[-1]  # previous and next are under the same class; access the last element
    next_page.click()
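To keep the scraped reviews in tabular form, collector_list (the list of (heading, text, rating) tuples built above) can be dropped into a DataFrame; a small follow-up sketch, with the column names chosen here for illustration:
import pandas as pd

reviews_df = pd.DataFrame(collector_list, columns=['heading', 'review_text', 'rating'])
reviews_df.to_csv('flipkart_reviews.csv', index=False)
print(reviews_df.head())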

Selenium: no next page

I am scraping a site, and the problem I ran into today is that when there is no next page, the site returns the previous page without any error, so I cannot tell that the previous page was the last one.
For example: https://example/page-7
When I try to go to https://example/page-8, which doesn't exist, it gives me
the last page: https://example/page-7
How can I determine that https://example/page-7 was the last page, using Python 3?
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import urllib.request

page = "https://www.supermama.lt/forumas/topic/214375-vilma/page-1"
driver = webdriver.Firefox()
driver.get(page)
dd = driver.page_source
for j in range(100):
    soup = BeautifulSoup(dd, "html.parser")
    my_text = list(soup.findAll("div", class_="post-content"))
    for i in my_text:
        # collect some data
        pass
    page = "https://www.supermama.lt/forumas/topic/214375-vilma/page{0}".format(j + 2)
    driver.get(page)
    dd = driver.page_source
At first I was thinking about checking for duplicates in the collected data, but that is too slow because I have 30,000 links to collect data from. Maybe there is an easier solution?
Found the answer to my own question.
To find the current page URL, just use driver.current_url.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import urllib.request

page = "https://www.supermama.lt/forumas/topic/214375-vilma/"
driver = webdriver.Firefox()
driver.get(page)
dd = driver.page_source

current_pages = []
for j in range(100):
    page_url = driver.current_url
    if page_url not in current_pages:
        current_pages.append(page_url)
        soup = BeautifulSoup(dd, "html.parser")
        my_text = list(soup.findAll("div", class_="post-content"))
        for i in my_text:
            # collect some data
            pass
        page = "https://www.supermama.lt/forumas/topic/214375-vilma/page-{0}".format(j + 2)
        driver.get(page)
        dd = driver.page_source
    else:
        print(current_pages)
        driver.quit()
        break
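A slightly simpler variant of the same idea is to compare driver.current_url with the URL that was just requested: when the site silently falls back to the last existing page, the two differ and the loop can stop. A sketch, assuming the forum keeps the page-N form in the address bar for pages that do exist (driver and BeautifulSoup set up as above):
j = 2
while True:
    page = "https://www.supermama.lt/forumas/topic/214375-vilma/page-{0}".format(j)
    driver.get(page)
    if driver.current_url != page:
        # the site redirected back to an existing page, so the previous page was the last one
        break
    soup = BeautifulSoup(driver.page_source, "html.parser")
    for i in soup.findAll("div", class_="post-content"):
        # collect some data
        pass
    j += 1
driver.quit()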
