I've tried to create a web scraper for CNN. My goal is to scrape all news articles within a search query. Sometimes I get output for some of the scraped pages, and sometimes it doesn't work at all.
I am using the selenium and BeautifulSoup packages in a Jupyter Notebook. I am iterating over the pages via the URL parameters &page={} and &from={}. I previously tried By.XPATH and simply clicking the next button at the end of the page, but it gave me the same results.
Here's the code I'm using:
#0 ------------import libraries
import requests
from bs4 import BeautifulSoup
from bs4.element import Tag
import feedparser
import urllib
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pickle
import pandas as pd
#3 ------------CNN SCRAPER
#3.1 ----------Define Function
def CNN_Scraper(max_pages):
    base = "https://edition.cnn.com/"
    browser = webdriver.Chrome('C:/chromedriver_win32/chromedriver.exe')
    # implicitly_wait applies to every subsequent element lookup; it returns None,
    # so there is nothing to store or re-reference later
    browser.implicitly_wait(30)
    base_url = 'https://edition.cnn.com/search?q=coronavirus&sort=newest&category=business,us,politics,world,opinion,health&size=100'

    #-------------Define empty lists to be scraped
    CNN_title = []
    CNN_date = []
    CNN_article = []
    CNN_link = []
    article_count = 0

    #-------------Iterate over pages and extract
    for page in range(1, max_pages + 1):
        print("Page %d" % page)
        url = base_url + "&page=%d&from=%d" % (page, article_count)
        browser.get(url)
        soup = BeautifulSoup(browser.page_source, 'lxml')
        search_results = soup.find('div', {'class': 'cnn-search__results-list'})
        contents = search_results.find_all('div', {'class': 'cnn-search__result-contents'})
        for content in contents:
            try:
                title = content.find('h3').text
                print(title)
                link = content.find('a')
                link_url = link['href']
                date = content.find('div', {'class': 'cnn-search__result-publish-date'}).text.strip()
                article = content.find('div', {'class': 'cnn-search__result-body'}).text
            except Exception:
                print("Could not parse this result, skipping it")
                continue
            CNN_title.append(title)
            CNN_date.append(date)
            CNN_article.append(article)
            CNN_link.append(link_url)
        article_count += 100
        print("-----")

    #-------------Save in DF
    df = pd.DataFrame()
    df['title'] = CNN_title
    df['date'] = CNN_date
    df['article'] = CNN_article
    df['link'] = CNN_link

    browser.quit()
    #print("Complete")
    return df
#3.2 ----------Call Function - Scrape CNN and save pickled data
CNN_data = CNN_Scraper(2)
#CNN_data.to_pickle("CNN_data")
Call the back-end API directly. For more details, check my previous answer.
import requests
import json
def main(url):
    with requests.Session() as req:
        for item in range(1, 1000, 100):
            r = req.get(url.format(item)).json()
            for a in r['result']:
                print("Headline: {}, Url: {}".format(a['headline'], a['url']))

main("https://search.api.cnn.io/content?q=coronavirus&sort=newest&category=business,us,politics,world,opinion,health&size=100&from={}")
It seems that Beautiful Soup is not able to retrieve the info from a table.
What I am trying to do is retrieve the table with its header and save it to a DataFrame in pandas. Any help is much appreciated.
import requests
from bs4 import BeautifulSoup
import pandas as pd
# Create an URL object
url = 'xxxx'
# Create object page
page = requests.get(url)
soup = BeautifulSoup(page.content, "html5lib")
data = soup.find_all("table", id="cve_table", attrs={"class": "table"})
print(len(data))
headers = []
for body in data:
    print(body)
    for item in body:
        title = item.text
        print(title)
        headers.append(title)

print(headers)
All I got is this:
<table class="table cell-border table-striped table-condensed table-hover" id="cve_table">
<tbody></tbody>
</table>
['\n ', '', '\n\n ']
It seems that the table is rendered by JavaScript, so when requests loads the HTML page, the table is empty. Investigating the page source, you can see that the table is rendered by the function called in the last script element. This function takes as a parameter the data structure needed to render the table, which can be extracted as follows:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import json
from collections import defaultdict
# Create an URL object
url = 'https://cve.rayvyn.net/rayvyn'
# Create object page
page = requests.get(url)
soup = BeautifulSoup(page.content, "html5lib")
dct = defaultdict(list)
script = soup.find(lambda tag: tag.name == "script" and "get_all_cve_data" in tag.text)
if script:
    # Extract the argument of the rendering function: everything between the parentheses
    result = re.search(r'\((.*)\)', script.text)
    text = result.group(1)
    # The argument is a JSON-encoded string that itself contains JSON, hence the double loads
    data = json.loads(json.loads(text))
    for row in data:
        dct['CVE ID'].append(row[0])
        dct['Feed'].append(row[1])
        dct['Date Modified'].append(row[2])
        dct['Description'].append(row[3])
        dct['Vector'].append(row[4])
        dct['Vendor'].append(row[5])
        dct['Product'].append(row[6])
        dct['Advisory Link'].append(row[7])
else:
    print('Script tag with function get_all_cve_data() not found')
df = pd.DataFrame(dct)
df
An alternative approach would be to use the selenium framework:
import re
import json
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from collections import defaultdict
import pandas as pd
# Create an URL object
url = 'https://cve.rayvyn.net/rayvyn'
# delay for selenium web driver wait
DELAY = 30
# create selenium driver
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
driver = webdriver.Chrome('<<path to chromedriver.exe>>', options = chrome_options)
# open web page
driver.get(url)
script = WebDriverWait(driver, DELAY).until(EC.presence_of_element_located((By.XPATH, "//script[contains(text(), 'get_all_cve_data')]")))
dct = defaultdict(list)
if script:
    result = re.search(r'\((.*)\)', script.get_attribute('innerHTML'))
    text = result.group(1)
    data = json.loads(text)
    print(data)
else:
    print('Script tag with function get_all_cve_data() not found')

driver.quit()

data = json.loads(json.loads(text))
for row in data:
    dct['CVE ID'].append(row[0])
    dct['Feed'].append(row[1])
    dct['Date Modified'].append(row[2])
    dct['Description'].append(row[3])
    dct['Vector'].append(row[4])
    dct['Vendor'].append(row[5])
    dct['Product'].append(row[6])
    dct['Advisory Link'].append(row[7])
df = pd.DataFrame(dct)
df
Please note that using selenium also requires the selenium webdriver (a separate executable). It simulates browser behavior and (among other features) waits for the JavaScript code on the page to be executed and the HTML to be rendered.
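If you would rather not download and point to the chromedriver executable by hand, the third-party webdriver-manager package (pip install webdriver-manager) can fetch a matching driver at runtime. Below is a minimal sketch using the same Selenium 3-style API as above; treat it as a convenience suggestion rather than part of the original answer:
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')

# ChromeDriverManager().install() downloads a chromedriver matching the local
# Chrome version and returns the path to the executable
driver = webdriver.Chrome(ChromeDriverManager().install(), options=chrome_options)
driver.get('https://cve.rayvyn.net/rayvyn')
print(driver.title)
driver.quit()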
I am having an issue scraping the print preview page and exporting it to a CSV file. When clicking on the Print Preview URL, instead of providing only the text data, it also gives me the page source code.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.select import Select
import urllib.request
from bs4 import BeautifulSoup
import csv
import os
import time
import pandas as pd
from openpyxl.workbook import Workbook
from pandas import ExcelWriter
url = 'https://maharerait.mahaonline.gov.in'
chrome_path = r'C:/Users/User/AppData/Local/Programs/Python/Python36/Scripts/chromedriver.exe'
driver = webdriver.Chrome(executable_path=chrome_path)
driver.get(url)
WebDriverWait(driver, 20).until(EC.element_to_be_clickable(
    (By.XPATH, "//div[@class='search-pro-details']//a[contains(.,'Search Project Details')]"))).click()

Registered_Project_radio = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable((By.ID, "Promoter")))
driver.execute_script("arguments[0].click();", Registered_Project_radio)

Application = driver.find_element_by_id("CertiNo")
Application.send_keys("P50500000005")

Search = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable((By.ID, "btnSearch")))
driver.execute_script("arguments[0].click();", Search)

View = [item.get_attribute('href') for item in driver.find_elements_by_tag_name("a")]
print(View)

driver.get(View)
request = urllib.request.Request(View)
html = urllib.request.urlopen(request).read()
soup = BeautifulSoup(html, 'html.parser')
print(soup.text)
View = [item.get_attribute('href') for item in driver.find_elements_by_tag_name("a")]
is a list, e.g. ['https://maharerait.mahaonline.gov.in/PrintPreview/PrintPreview?q=MB0agrub1IW1Z63O5lldJdHpk6le6np5EB3HZ3jy8r7qPsLpYPdQwJzwE0S5LXG3fqQe%2fUe6HTGYmXstD%2bcYtATvmObra1R4xBa7L235mdTlmH0wHJPnps0ZXvbDMZxA0Hf9fxpOLM%2ba3Ad13hq9M1bp8Agvb%2bCLA3KOgpoYwr0%3d', None, None, None, None],
which contains a URL and None elements.
Replace your code after the driver.execute_script("arguments[0].click();", Search) statement with the code below:
View = [item.get_attribute('href') for item in driver.find_elements_by_tag_name("a")
        if item.get_attribute('href') is not None]

for url in View:
    request = urllib.request.Request(url)
    html = urllib.request.urlopen(request).read()
    soup = BeautifulSoup(html, 'html.parser')
    divPInfo = soup.find("div", {"id": "DivPInfo"})
    title = divPInfo.find("div", {'class': 'x_panel'}, recursive=False).find(
        "div", {'class': 'x_title'}).find("h2").text.strip()
    x_contentObject = divPInfo.find("div", {'class': 'x_panel'}, recursive=False).find_all(
        "div", {'class': 'col-md-3'})
    my_dict = {title: {x_contentObject[0].text.strip(): x_contentObject[1].text.strip()}}
    print(my_dict)
O/P:
{'General Information': {'Information Type': 'Other Than Individual'}}
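If you need every label/value pair in that panel rather than just the first one, a small helper can pair the col-md-3 divs two at a time. This is only a sketch: it assumes the labels and values strictly alternate inside the panel, which should be verified against the page:
def panel_to_dict(title, value_divs):
    # Assumption: the col-md-3 divs alternate label, value, label, value, ...
    labels = [d.text.strip() for d in value_divs[0::2]]
    values = [d.text.strip() for d in value_divs[1::2]]
    return {title: dict(zip(labels, values))}

# Inside the loop above, instead of building my_dict from only the first pair:
# my_dict = panel_to_dict(title, x_contentObject)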
driver.get(View)
div_2 = driver.find_element_by_xpath("//div[@id='fldFirm']").text
print(div_2)
table = pd.read_html(driver.page_source)
#print(table)
#df.to_csv("trial.csv", sep=',', index=False)
div_3 = driver.find_element_by_xpath("//div[@id='DivProject']").text
print(div_3)
file2 = open("MyFile2.txt", "w")
file2.writelines(div_3)
file2.close()
With reference to the above code, I want to scrape the second heading (FSI Details) from the div with id = DivProject, but I am unable to get the second heading.
divPInfo2 = soup.find("div", {"id": "DivProject"})
Project_title1 = divPInfo2.find("div", {'class': 'x_panel'}, recursive=False).find(
    "div", {'class': 'x_title'}).find("h2")[1].get_text(strip=True)
print(Project_title1)
I am trying to scrape product details from AliExpress. I have two questions. First, how do I scrape the category and save it in the CSV file in front of each product? Second, how do I move to the second and further pages until there are no more pages available, or until page 10?
This is the code I have written to find the next pages:
from bs4 import BeautifulSoup
import requests as r
page = r.get('https://www.aliexpress.com/category/200000664/jackets.html?spm=2114.11010108.102.4.650c649b8lfPOb')
soup = BeautifulSoup(page.content,'html.parser')
content = soup.find(id="pagination-bottom")
pages = content.findAll('a')
for i in pages:
    print('https:' + i.get('href'))
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
import urllib.request
filename = "alibaba.csv"
f=open(filename, "w")
headers="product_name, price, Rating \n"
f.write(headers)
class alibabascrape(object):
    def __init__(self, keyword):
        self.keyword = keyword
        self.url = f"https://www.aliexpress.com/wholesale?catId=0&initiative_id=&SearchText={keyword}"
        self.driver = webdriver.Firefox(executable_path=r'c:\geckodriver.exe')
        self.delay = 3

    def load_alibabalist_url(self):
        self.driver.get(self.url)
        try:
            wait = WebDriverWait(self.driver, self.delay)
            wait.until(EC.presence_of_all_elements_located((By.ID, "form-searchbar")))
            print("page is ready")
        except TimeoutException:
            print("Too much Time")

    def extract_post_information(self):
        all_posts = self.driver.find_elements_by_class_name("list-item")
        post_title_list = []
        for post in all_posts:
            title = post.text.split("\n")
            name = title[0]
            print(name)
            price = title[2]
            print(price)
            rating = title[6]
            print(rating)
            f.write(name + "," + price + "," + rating + "\n")
            post_title_list.append(post.text)
        return post_title_list

    def extract_category(self):
        category = self.driver.find_elements_by_class_name("col-sub")
        print(category)

    def extract_post_urls(self):
        url_list = []
        html_page = urllib.request.urlopen(self.url)
        soup = BeautifulSoup(html_page, "lxml")
        for link in soup.findAll("a", {"class": "history-item product"}):
            print(link["href"])
            url_list.append(link["href"])
        return url_list

keyword = "iphone"
scrapper = alibabascrape(keyword)
scrapper.load_alibabalist_url()
scrapper.extract_post_information()
scrapper.extract_category()
scrapper.extract_post_urls()
I can help you with the pagination:
If you get all the page links, then you can simply use a for loop to iterate over them.
If you only have a prev/next page link, then use a while loop that checks whether the link still exists and clicks it, as in the rough sketch below.
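A rough sketch of both patterns; the class name 'next-page' and the URLs are placeholders, not AliExpress's real selectors:
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException

driver = webdriver.Firefox()

# Pattern 1: you already collected every page link (e.g. with the BeautifulSoup snippet above)
page_links = ['https://www.aliexpress.com/category/200000664/jackets.html']  # placeholder list
for link in page_links:
    driver.get(link)
    # ... scrape the page here ...

# Pattern 2: only a "next" button is available - keep clicking until it disappears
while True:
    # ... scrape the current page here ...
    try:
        next_btn = driver.find_element_by_class_name('next-page')  # placeholder class name
    except NoSuchElementException:
        break  # no next button left, so this was the last page
    next_btn.click()

driver.quit()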
I am trying to scrape Flipkart to extract reviews for a product using the requests and BeautifulSoup packages. How can I take out the data hidden behind the Read more click event present in those reviews?
from selenium import webdriver
from selenium.webdriver.common.by import By
from contextlib import closing
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver import Firefox
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
import time
import re
from bs4 import BeautifulSoup
import unicodedata

def remove_non_ascii_1(text):
    return ''.join([i if ord(i) < 128 else ' ' for i in text])

with closing(Firefox()) as browser:
    site = "https://www.flipkart.com/asus-zenfone-2-laser-ze550kl-black-16-gb/product-reviews/itme9j58yzyzqzgc?pid=MOBE9J587QGMXBB7"
    browser.get(site)
    file = open("review.txt", "w")
    for count in range(1, 10):
        # Find the pagination button for the current page number and click it
        nav_btns = browser.find_elements_by_class_name('_33m_Yg')
        button = ""
        for btn in nav_btns:
            number = int(btn.text)
            if number == count:
                button = btn
                break
        button.send_keys(Keys.RETURN)
        WebDriverWait(browser, timeout=10).until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, "_2xg6Ul")))
        # Expand every truncated review by clicking its READ MORE link
        read_more_btns = browser.find_elements_by_class_name('_1EPkIx')
        for rm in read_more_btns:
            browser.execute_script("return arguments[0].scrollIntoView();", rm)
            browser.execute_script("window.scrollBy(0, -150);")
            rm.click()
        page_source = browser.page_source
        soup = BeautifulSoup(page_source, "lxml")
        ans = soup.find_all("div", class_="_3DCdKt")
        for tag in ans:
            title = str(tag.find("p", class_="_2xg6Ul").string).replace(u"\u2018", "'").replace(u"\u2019", "'")
            title = remove_non_ascii_1(title)
            content = tag.find("div", class_="qwjRop").div.prettify().replace(u"\u2018", "'").replace(u"\u2019", "'")
            content = remove_non_ascii_1(content)
            content = content[15:-7]
            votes = tag.find_all("span", class_="_1_BQL8")
            upvotes = int(votes[0].string)
            downvotes = int(votes[1].string)
            file.write("Review Title : %s\n\n" % title)
            file.write("Upvotes : " + str(upvotes) + "\n\nDownvotes : " + str(downvotes) + "\n\n")
            file.write("Review Content :\n%s\n\n\n\n" % content)
    file.close()
Usage:
Install the requirements by running pip install bs4 selenium.
Add geckodriver to the PATH. Follow these instructions.
Put the link of the product in site variable inside the script.
Run the script by running python scrape.py.
Reviews will be saved in the file review.txt.
I had some issues using @CSMaverick's code while accessing the READ MORE link, so I modified the code as per my requirements.
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup as bs

def get_source_code(browser):
    # Expand all truncated reviews, then return the rendered HTML
    rm_btns = browser.find_elements_by_class_name('_1BWGvX')
    for rm_btn in rm_btns:
        rm_btn.click()
    return browser.page_source

def collect_reviews_attributes(html):
    soup_obj = bs(html, "html.parser")
    text_tag_divs = soup_obj.find_all('div', attrs={"class": "t-ZTKy"})
    heading_tag_divs = soup_obj.find_all('p', attrs={"class": "_2-N8zT"})
    rating_tag_divs = soup_obj.find_all('div', attrs={"class": "_3LWZlK _1BLPMq"})
    text_tags = [tag.text for tag in text_tag_divs]
    heading_tags = [tag.text for tag in heading_tag_divs]
    rating_tags = [tag.text for tag in rating_tag_divs]
    return list(zip(heading_tags, text_tags, rating_tags))

collector_list = []
browser = webdriver.Firefox(executable_path=r"path to\geckodriver.exe")
url = "https://www.flipkart.com/samsung-253-l-frost-free-double-door-3-star-convertible-refrigerator/product-reviews/itmf75fa1554bad3?pid=RFRFNDEEJ28SNQPG&lid=LSTRFRFNDEEJ28SNQPGEJ3YHJ&sortOrder=MOST_HELPFUL&certifiedBuyer=false&aid=overall"
num_pages = 3  # get from the url dynamically, or else give a large number and keep hitting until you get an exception

browser.get(url)  # open the url in the browser
for _ in range(num_pages):
    page_source_code = get_source_code(browser)
    collector_list.extend(collect_reviews_attributes(page_source_code))
    next_page = browser.find_elements_by_class_name('_1LKTO3')[-1]  # previous and next share a class; the last element is "next"
    next_page.click()
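To persist what was collected, one option is to drop collector_list into a pandas DataFrame and write it out; the column names below are just labels for the tuples built in collect_reviews_attributes:
import pandas as pd

# Each tuple in collector_list is (heading, text, rating), in the order zipped above
reviews_df = pd.DataFrame(collector_list, columns=['heading', 'text', 'rating'])
reviews_df.to_csv('flipkart_reviews.csv', index=False)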
I am scraping paginated pages, but the problem I ran into today is that a page didn't have a next page, and the site gave me the previous page without any error from which I could determine that it was the last one.
For example: https://example/page-7
When I try to go to https://example/page-8, which doesn't exist, it gives me the last page: https://example/page-7.
How can I determine that https://example/page-7 was the last page using Python 3?
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import urllib.request
page = "https://www.supermama.lt/forumas/topic/214375-vilma/page-1"
driver = webdriver.Firefox()
driver.get(page)
dd = driver.page_source
for j in range(100):
    soup = BeautifulSoup(dd, "html.parser")
    my_text = list(soup.findAll("div", class_="post-content"))
    for i in my_text:
        # collect some data
        pass
    page = "https://www.supermama.lt/forumas/topic/214375-vilma/page-{0}".format(j + 2)
    driver.get(page)
    dd = driver.page_source
At first I was thinking about checking for duplicates in the collected data, but that is too slow because I have 30,000 links to collect data from. Maybe there is an easier solution?
Found the answer to my own question.
To find the current page URL, just use driver.current_url:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import urllib.request
page = "https://www.supermama.lt/forumas/topic/214375-vilma/"
driver = webdriver.Firefox()
driver.get(page)
dd = driver.page_source
current_pages = []
for j in range(100):
    page_url = driver.current_url
    if page_url not in current_pages:
        current_pages.append(page_url)
        soup = BeautifulSoup(dd, "html.parser")
        my_text = list(soup.findAll("div", class_="post-content"))
        for i in my_text:
            # collect some data
            pass
        page = "https://www.supermama.lt/forumas/topic/214375-vilma/page-{0}".format(j + 2)
        driver.get(page)
        dd = driver.page_source
    else:
        print(current_pages)
        driver.quit()
        break
break