Loop with BeautifulSoup to web-scrape multiple pages by timestamps - Python

I am trying to retrieve the day temperature from a local weather site.
I built this loop using BeautifulSoup.
Unfortunately the loop breaks after the first round.
This is my code and the result.

Code:
#coding: latin-1
import requests
from bs4 import BeautifulSoup
from selenium import webdriver

# create a file zamg-data.txt
# separated with commas
f = open('zamg-data.txt','w')

# start webdriver
driver = webdriver.Chrome("/usr/local/bin/chromedriver")

# loop through months and days
for m in range(1, 13):
    for d in range(1, 32):
        # stop when past the last day of the month
        if (m == 2 and d > 28):
            break
        elif (m in [4, 6, 9, 11] and d > 30):
            break

        # open zamg site
        timestamp = '2019' + '-' + str(m) + '-' + str(d)
        print("call page of " + timestamp)
        url = "https://www.zamg.ac.at/cms/de/klima/klima-aktuell/klimamonitoring/?param=t&period=period-ymd-" + timestamp
        driver.get(url)

        # extract temperature
        html = driver.execute_script("return document.documentElement.outerHTML")
        soup = BeautifulSoup(html, "html.parser")
        data = soup.find_all(class_='u-txt--big')[1].string
        print(len(data))
        print(data + '...okay')

        # format month for timestamp
        if len(str(m)) < 2:
            mStamp = '0' + str(m)
        else:
            mStamp = str(m)

        # format day for timestamp
        if len(str(d)) < 2:
            dStamp = '0' + str(d)
        else:
            dStamp = str(d)

        # timestamp
        timestamp = '2019' + mStamp + dStamp

        # write time and value
        f.write(timestamp + ',' + data + '\n')

# data is extracted - close
f.close()
My result:
➜ weather-app python get-data-02.py
call page of 2019-1-1
5
+3,9 ...okay
call page of 2019-1-2
Traceback (most recent call last):
File "get-data-02.py", line 37, in <module>
data = soup.find_all(class_='u-txt--big')[1].string
IndexError: list index out of range
➜ weather-app
I don't understand what is wrong here. The second page is loaded in the browser, but then it breaks.
Any ideas?

#coding: latin-1
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import datetime
import time

base = datetime.datetime(2019, 1, 1).date()
date_list = [base + datetime.timedelta(days=x) for x in range(365)]

# start webdriver
driver = webdriver.Chrome("/usr/local/bin/chromedriver")
base_url = "https://www.zamg.ac.at/cms/de/klima/klima-aktuell/klimamonitoring/?param=t&period=period-ymd-"

with open('zamg-data.txt', 'w') as file:
    for dt in date_list:
        timestamp = dt.strftime("%Y-%m-%d")
        print("call page of " + timestamp)
        url = f"{base_url}{timestamp}"
        driver.get(url)
        # wait until all elements of the class are present
        WebDriverWait(driver, timeout=40).until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, "u-txt--big"))
        )

        # extract temperature
        html = driver.execute_script("return document.documentElement.outerHTML")
        soup = BeautifulSoup(html, "html.parser")
        data = soup.find_all(class_='u-txt--big')[1].string
        print(len(data))
        print(data + '...okay')

        # timestamp for the output file
        timestamp_1 = dt.strftime("%Y%m%d")
        # write time and value
        file.write(timestamp_1 + ',' + data + '\n')

        # small delay so the site is not flooded with requests
        time.sleep(3)

driver.quit()
print("Done!!!")
As someone in the comment section mentioned, you need to make the browser wait until all elements of that class are detected. I've also added an explicit time delay after each page load so that the website is not overwhelmed with requests; hammering it is a potential way to get your IP blocked. It's also best to always use a context manager whenever you can.
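As a side note, recent Selenium versions also let the driver itself act as a context manager, so it is quit automatically even if the scrape raises. A minimal sketch of that pattern (the URL here is only a placeholder):

from selenium import webdriver

# driver.quit() is called automatically when the block exits,
# even if an exception is raised mid-scrape
with webdriver.Chrome() as driver:
    driver.get("https://www.example.com")
    print(driver.title)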

Related

Web Scraping on Rotten Tomatoes - I want to be able to scrape over 100 movies, but right now I have only scraped three.

I should be receiving 100 different movies and their movie name, source, rating, text review, and date in data.head(), from the Rotten Tomatoes website.
from bs4 import BeautifulSoup
import re
import time
import requests
import pandas as pd
#!pip install selenium
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service

movie_list = ['divergent', 'top_gun', 'pursuit_of_happiness']

with open(name + "_" + ".csv", 'w', encoding='utf-8') as fw:
    for movie in movie_list:
        pageLink = 'https://www.rottentomatoes.com/m/' + movie + '/reviews/'
        path = "/Users/name/desktop/chromedriver"
        s = Service(path)
        browser = webdriver.Chrome(service=s)
        browser.get(pageLink)

        pageNum = 10000
        for p in range(0, pageNum):
            print('page', p + 1)
            page_source = browser.page_source
            soup = BeautifulSoup(page_source, 'lxml')
            reviews = soup.findAll('div', {'class': re.compile('review_table_row')})
            for review in reviews:
                rating, text, date = 'NA', 'NA', 'NA'
                rating_info = review.find('div', {'class': re.compile("review_icon")})
                if rating_info:
                    rating = rating_info.attrs["class"][3]
                    print(rating)
                text_info = review.find('div', {'class': re.compile("the_review")})
                if text_info:
                    text = text_info.text.strip()
                    print(text)
                review_date = review.find('div', {'class': re.compile("review-date subtle small")})
                if review_date:
                    date = review_date.text.strip()
                    print(date)
                fw.write(rating + '\t' + text + '\t' + date + '\n')
            # move to the next page by clicking on the "next" button with selenium
            if p < pageNum:
                browser.find_element(By.XPATH, '//button[@class="js-prev-next-paging-next btn prev-next-paging__button prev-next-paging__button-right"]').click()
                time.sleep(2)
                #<span class="prev-next-paging__button-text">Next</span>
        browser.quit()

data = pd.read_csv("your_name.csv", delimiter="\t", header=None)
data.columns = ['Movie', 'Source', 'Rating', 'Text_Review', 'Date']
data.head()
I was trying to do it manually, but I think there is a faster and more efficient way to do it by web scraping... however, I am not sure how. Maybe by using a link that contains the top 100 movies?
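One possible direction, sketched only as an illustration: if you have any listing page that links to the movies you care about, you can pull the /m/<slug> hrefs out of it and build movie_list from those slugs instead of hard-coding three titles. The listing URL below is a placeholder, not a real endpoint, and this assumes the page is plain HTML that requests can fetch:

import re
import requests
from bs4 import BeautifulSoup

# hypothetical listing page that links to the movies you want (placeholder URL)
LISTING_URL = "https://www.rottentomatoes.com/some-top-100-list"

resp = requests.get(LISTING_URL, headers={"User-Agent": "Mozilla/5.0"})
soup = BeautifulSoup(resp.text, "lxml")

# collect the slug part of every /m/<slug> link on the page
movie_list = []
for a in soup.find_all("a", href=re.compile(r"^/m/[^/]+")):
    slug = a["href"].split("/m/")[1].strip("/").split("/")[0]
    if slug not in movie_list:
        movie_list.append(slug)

print(movie_list[:10])

The resulting movie_list can then be fed into the review-scraping loop above.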

Scraping the web with Python, Scrapy and BeautifulSoup? [duplicate]

I've tried to create a web scraper for CNN. My goal is to scrape all news articles within the search query. Sometimes I get output for some of the scraped pages and sometimes it doesn't work at all.
I am using the selenium and BeautifulSoup packages in a Jupyter Notebook. I am iterating over the pages via the URL parameters &page={}&from={}. I tried By.XPATH before and simply clicked the next button at the end of the page, but it gave me the same results.
Here's the code I'm using:
#0 ------------import libraries
import requests
from bs4 import BeautifulSoup
from bs4.element import Tag
import feedparser
import urllib
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pickle
import pandas as pd

#3 ------------CNN SCRAPER
#3.1 ----------Define function
def CNN_Scraper(max_pages):
    base = "https://edition.cnn.com/"
    browser = webdriver.Chrome('C:/chromedriver_win32/chromedriver.exe')
    load_content = browser.implicitly_wait(30)
    base_url = 'https://edition.cnn.com/search?q=coronavirus&sort=newest&category=business,us,politics,world,opinion,health&size=100'

    #-------------Define empty lists to be scraped
    CNN_title = []
    CNN_date = []
    CNN_article = []
    CNN_link = []
    article_count = 0

    #-------------Iterate over pages and extract
    for page in range(1, max_pages + 1):
        print("Page %d" % page)
        url = base_url + "&page=%d&from=%d" % (page, article_count)
        browser.get(url)
        load_content
        soup = BeautifulSoup(browser.page_source, 'lxml')
        search_results = soup.find('div', {'class': 'cnn-search__results-list'})
        contents = search_results.find_all('div', {'class': 'cnn-search__result-contents'})
        for content in contents:
            try:
                title = content.find('h3').text
                print(title)
                link = content.find('a')
                link_url = link['href']
                date = content.find('div', {'class': 'cnn-search__result-publish-date'}).text.strip()
                article = content.find('div', {'class': 'cnn-search__result-body'}).text
            except:
                print("loser")
                continue
            CNN_title.append(title)
            CNN_date.append(date)
            CNN_article.append(article)
            CNN_link.append(link_url)
        article_count += 100
        print("-----")

    #-------------Save in DF
    df = pd.DataFrame()
    df['title'] = CNN_title
    df['date'] = CNN_date
    df['article'] = CNN_article
    df['link'] = CNN_link

    #print("Complete")
    browser.quit()
    return df

#3.2 ----------Call function - scrape CNN and save pickled data
CNN_data = CNN_Scraper(2)
#CNN_data.to_pickle("CNN_data")
Call the back-end API directly. For more details, check my previous answer.
import requests
import json

def main(url):
    with requests.Session() as req:
        for item in range(1, 1000, 100):
            r = req.get(url.format(item)).json()
            for a in r['result']:
                print("Headline: {}, Url: {}".format(
                    a['headline'], a['url']))

main("https://search.api.cnn.io/content?q=coronavirus&sort=newest&category=business,us,politics,world,opinion,health&size=100&from={}")

Python: Find condition to break out of loop

The purpose of my code is to web scrape a table that has multiple pages.
So far, with the use of selenium and bs4, I've managed to do just that. However, I am having trouble breaking out of my loop, seeing as the last page still has the 'next' button; as a result, the program keeps scraping the last page over and over.
from selenium import webdriver
from bs4 import BeautifulSoup as bs
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.keys import Keys
import csv
import datetime as dt

# website url
url = "https://poit.bolagsverket.se/poit/PublikSokKungorelse.do?method=redirect&forward=main.no.sidebar.sokresultat"

# website
driver = webdriver.Chrome()
driver.get(url)

# click "sök kungörelse"
driver.find_element_by_xpath('//*[@id="nav1-2"]').click()

# click "avancerad sökning"
driver.find_element_by_xpath('//*[@id="content"]/form/div[2]/a').click()

# select "annan period"
select = Select(driver.find_element_by_id('tidsperiod'))
select.select_by_value('6')

# select "skuldsanering"
select = Select(driver.find_element_by_id('amnesomrade'))
select.select_by_value('5')

# select "inledande av skuldsanering"
select = Select(driver.find_element_by_id('kungorelserubrik'))
select.select_by_value('29')

# calculate dates
today = dt.date.today()
last_monday = str(today - dt.timedelta(days=7))
last_friday = str(today - dt.timedelta(days=3))

# insert search dates
inputElement = driver.find_element_by_id("from")
inputElement.send_keys(last_monday)
inputElement = driver.find_element_by_id("tom")
inputElement.send_keys(last_friday)

# click "sök"
driver.find_element_by_xpath('//*[@id="SokKungorelse"]').click()

# get updated page
html = driver.page_source

# scrape table
with open('skuldsanering.txt', 'w', encoding='utf-8') as r:
    while True:
        html = driver.page_source
        soup = bs(html, 'html.parser')
        table = soup.find('tbody')
        table_rows = table.find_all('tr')
        for tr in table_rows:
            td = tr.find_all('td')
            row = [i.get_text(strip=True) for i in td]
            csv_writer = csv.writer(r)
            csv_writer.writerows([row])
        try:
            driver.find_element_by_xpath('//*[@id="movenextTop"]').click()
            soup = bs(html, 'html.parser')
        except:
            # insert condition to break out of loop
            break
I was thinking perhaps it would be possible to include a click counter and break out of the loop when the number of clicks (x) equals y in "Page x of y"? If that's a good solution, how do I move forward? If not, what would be a better solution?
Thank you very much in advance!
The results page shows "Page x of y"; you can check whether x == y each time and break the loop when it's true.
Here's the tag I'm talking about:
<em class="gotopagebuttons">Sida 17 av 17</em>
You can split the string or use a regex to get both page numbers and then compare them.
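For illustration, a minimal regex sketch of that comparison, using the label text from the example tag above:

import re

label = "Sida 17 av 17"  # text of the <em class="gotopagebuttons"> element
match = re.match(r"Sida (\d+) av (\d+)", label)
current_page, last_page = match.group(1), match.group(2)

if current_page == last_page:
    print("Last page reached - stop clicking 'next'")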
Hmm yeaaah, not really a fan of extracting the page number from raw text - but it seems to be the most convenient option - can't really think of another way of doing it. Try this:
def main():
    from selenium import webdriver
    from selenium.webdriver.support.ui import Select
    from bs4 import BeautifulSoup
    import datetime as dt
    import re

    url = "https://poit.bolagsverket.se/poit/PublikSokKungorelse.do"

    driver = webdriver.Chrome()
    driver.get(url)

    driver.find_element_by_xpath('//*[@id="nav1-2"]').click()
    driver.find_element_by_xpath('//*[@id="content"]/form/div[2]/a').click()

    select = Select(driver.find_element_by_id('tidsperiod'))
    select.select_by_value('6')

    select = Select(driver.find_element_by_id('amnesomrade'))
    select.select_by_value('5')

    select = Select(driver.find_element_by_id('kungorelserubrik'))
    select.select_by_value('29')

    today = dt.date.today()
    last_monday = str(today - dt.timedelta(days=7))
    last_friday = str(today - dt.timedelta(days=3))

    inputElement = driver.find_element_by_id("from")
    inputElement.send_keys(last_monday)
    inputElement = driver.find_element_by_id("tom")
    inputElement.send_keys(last_friday)

    driver.find_element_by_xpath('//*[@id="SokKungorelse"]').click()

    while True:
        page = driver.page_source
        soup = BeautifulSoup(page, "html.parser")

        label = soup.find("em", {"class": "gotopagebuttons"}).get_text(strip=True)
        pattern = r"Sida (\d+) av (\d+)"
        match = re.match(pattern, label)
        assert match is not None
        print(match.group())

        for row in soup.find("tbody").find_all("tr"):
            for td in row.find_all("td"):
                text = td.get_text(strip=True)
                print(" " * 4 + text)
            print(end="\n\n")

        if match.group(1) == match.group(2):
            # No more pages
            break

        driver.find_element_by_xpath('//*[@id="movenextTop"]').click()

    return 0

if __name__ == "__main__":
    import sys
    sys.exit(main())

How to scrape information from one directory with Selenium

I am scraping contact information from a directory site.
I need to scrape it with Selenium. It takes 3 steps:
1. Get the company URLs from the website.
2. Get all company URLs from the next page / all pages.
3. Scrape all contact information such as company name, website, email, etc.
The code is below, but I face two problems.
# -*- coding: utf-8 -*-
from time import sleep
from scrapy import Spider
from selenium import webdriver
from scrapy.selector import Selector
from scrapy.http import Request
from selenium.common.exceptions import NoSuchElementException
import pandas as pd

results = list()
driver = webdriver.Chrome('D:\chromedriver_win32\chromedriver.exe')
MAX_PAGE_NUM = 2

for i in range(1, MAX_PAGE_NUM):
    page_num = str(i)
    url = "http://www.arabianbusinesscommunity.com/category/Industrial-Automation-Process-Control/" + page_num
    driver.get(url)
    sleep(5)
    sel = Selector(text=driver.page_source)
    companies = sel.xpath('//*[@id="categorypagehtml"]/div[1]/div[7]/ul/li/b//@href').extract()
    for i in range(0, len(companies)):
        print(companies[i])
        results.append(companies[i])
    print('---')

    for result in results:
        url1 = "http://www.arabianbusinesscommunity.com" + result
        print(url1)
        driver.get(url1)
        sleep(5)
        sel = Selector(text=driver.page_source)
        name = sel.css('h2::text').extract_first()
        country = sel.xpath('//*[@id="companypagehtml"]/div[1]/div[2]/ul[1]/li[1]/span[4]/text()').extract_first()
        if country:
            country = country.strip()
        web = sel.xpath('//*[@id="companypagehtml"]/div[1]/div[2]/ul[1]/li[4]/a/@href').extract_first()
        email = sel.xpath('//a[contains(@href, "mailto:")]/@href').extract_first()

records = []
records.append((web, email, country, name))
df = pd.DataFrame(records, columns=['web', 'email', 'country', 'name'])
I wrote the code as above, but I have two problems:
1. I can only get the last company's information.
2. On each iteration of the loop, the script visits all the URLs it has already visited before.
Can anyone help solve these problems?
Here is code to get all companies' details from all pages:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()

baseUrl = "http://www.arabianbusinesscommunity.com/category/Industrial-Automation-Process-Control"
driver.get(baseUrl)

wait = WebDriverWait(driver, 5)
wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, ".search-result-list li")))

# Get last page number
lastPageHref = driver.find_element(By.CSS_SELECTOR, ".PagedList-skipToLast a").get_attribute("href")
hrefArray = lastPageHref.split("/")
lastPageNum = int(hrefArray[len(hrefArray) - 1])

# Get all URLs for the first page and save them in the companyUrls list
js = 'return [...document.querySelectorAll(".search-result-list li b a")].map(e=>e.href)'
companyUrls = driver.execute_script(js)

# Iterate through all pages and get all company URLs
for i in range(2, lastPageNum):
    driver.get(baseUrl + "/" + str(i))
    companyUrls.extend(driver.execute_script(js))

# Open each company page and get all details
companies = []
for url in companyUrls:
    driver.get(url)
    company = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "#companypagehtml")))
    name = company.find_element_by_css_selector("h2").text
    email = driver.execute_script('var e = document.querySelector(".email"); if (e!=null) { return e.textContent;} return "";')
    website = driver.execute_script('var e = document.querySelector(".website"); if (e!=null) { return e.textContent;} return "";')
    phone = driver.execute_script('var e = document.querySelector(".phone"); if (e!=null) { return e.textContent;} return "";')
    fax = driver.execute_script('var e = document.querySelector(".fax"); if (e!=null) { return e.textContent;} return "";')
    country = company.find_element_by_xpath(".//li[@class='location']/span[last()]").text.replace(",", "").strip()
    address = ''.join([e.text.strip() for e in company.find_elements_by_xpath(".//li[@class='location']/span[position() != last()]")])
    # collect the scraped fields for this company
    companies.append({'name': name, 'email': email, 'website': website,
                      'phone': phone, 'fax': fax, 'country': country, 'address': address})
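If you want the result as a pandas DataFrame like in the question's code, one way to finish up, assuming the companies list of dicts built in the loop above, is:

import pandas as pd

# companies is the list of dicts collected in the loop above
df = pd.DataFrame(companies)
df.to_csv("companies.csv", index=False)
print(df.head())

driver.quit()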

How to scrape review data present in Read more in Flipkart reviews

I am trying to scrape Flipkart to extract reviews for a product using the requests and BeautifulSoup packages. How can I take out the data hidden behind the "Read More" click event present in those reviews?
from selenium import webdriver
from selenium.webdriver.common.by import By
from contextlib import closing
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver import Firefox
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
import time
import urllib2
import re
from bs4 import BeautifulSoup
import unicodedata

def remove_non_ascii_1(text):
    return ''.join([i if ord(i) < 128 else ' ' for i in text])

with closing(Firefox()) as browser:
    site = "https://www.flipkart.com/asus-zenfone-2-laser-ze550kl-black-16-gb/product-reviews/itme9j58yzyzqzgc?pid=MOBE9J587QGMXBB7"
    browser.get(site)
    file = open("review.txt", "w")

    for count in range(1, 10):
        nav_btns = browser.find_elements_by_class_name('_33m_Yg')

        button = ""
        for btn in nav_btns:
            number = int(btn.text)
            if(number == count):
                button = btn
                break

        button.send_keys(Keys.RETURN)
        WebDriverWait(browser, timeout=10).until(EC.presence_of_all_elements_located((By.CLASS_NAME, "_2xg6Ul")))

        read_more_btns = browser.find_elements_by_class_name('_1EPkIx')

        for rm in read_more_btns:
            browser.execute_script("return arguments[0].scrollIntoView();", rm)
            browser.execute_script("window.scrollBy(0, -150);")
            rm.click()

        page_source = browser.page_source
        soup = BeautifulSoup(page_source, "lxml")

        ans = soup.find_all("div", class_="_3DCdKt")

        for tag in ans:
            title = unicode(tag.find("p", class_="_2xg6Ul").string).replace(u"\u2018", "'").replace(u"\u2019", "'")
            title = remove_non_ascii_1(title)
            title.encode('ascii', 'ignore')

            content = tag.find("div", class_="qwjRop").div.prettify().replace(u"\u2018", "'").replace(u"\u2019", "'")
            content = remove_non_ascii_1(content)
            content.encode('ascii', 'ignore')
            content = content[15:-7]

            votes = tag.find_all("span", class_="_1_BQL8")
            upvotes = int(votes[0].string)
            downvotes = int(votes[1].string)

            file.write("Review Title : %s\n\n" % title)
            file.write("Upvotes : " + str(upvotes) + "\n\nDownvotes : " + str(downvotes) + "\n\n")
            file.write("Review Content :\n%s\n\n\n\n" % content)

    file.close()
Usage:
1. Install the requirements by running pip install bs4 selenium.
2. Add geckodriver to the PATH. Follow these instructions.
3. Put the link of the product in the site variable inside the script.
4. Run the script with python scrape.py.
5. Reviews will be saved in the file review.txt.
Had some issues using @CSMaverick's code while accessing the READ MORE link. I modified the code as per my requirements.
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup as bs

def get_source_code(browser):
    rm_btns = browser.find_elements_by_class_name('_1BWGvX')
    for rm_btn in rm_btns:
        rm_btn.click()
    return browser.page_source

def collect_reviews_attributes(html):
    soup_obj = bs(html, "html.parser")
    text_tag_divs = soup_obj.find_all('div', attrs={"class": "t-ZTKy"})
    heading_tag_divs = soup_obj.find_all('p', attrs={"class": "_2-N8zT"})
    rating_tag_divs = soup_obj.find_all('div', attrs={"class": "_3LWZlK _1BLPMq"})
    text_tags = [tag.text for tag in text_tag_divs]
    heading_tags = [tag.text for tag in heading_tag_divs]
    rating_tags = [tag.text for tag in rating_tag_divs]
    return list(zip(heading_tags, text_tags, rating_tags))

collector_list = []
browser = webdriver.Firefox(executable_path=r"path to\geckodriver.exe")
url = "https://www.flipkart.com/samsung-253-l-frost-free-double-door-3-star-convertible-refrigerator/product-reviews/itmf75fa1554bad3?pid=RFRFNDEEJ28SNQPG&lid=LSTRFRFNDEEJ28SNQPGEJ3YHJ&sortOrder=MOST_HELPFUL&certifiedBuyer=false&aid=overall"
num_pages = 3  # get from the url dynamically, or else give a large number and keep hitting until you get an exception

browser.get(url)  # open the url in the browser
for _ in range(num_pages):
    page_source_code = get_source_code(browser)
    collector_list.extend(collect_reviews_attributes(page_source_code))
    next_page = browser.find_elements_by_class_name('_1LKTO3')[-1]  # "previous" and "next" are under the same class; access the last element
    next_page.click()
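To turn collector_list into something tabular afterwards, one option (assuming pandas is available) is:

import pandas as pd

# each entry in collector_list is a (heading, text, rating) tuple
reviews_df = pd.DataFrame(collector_list, columns=['heading', 'review_text', 'rating'])
reviews_df.to_csv('flipkart_reviews.csv', index=False)
print(reviews_df.head())

browser.quit()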
