The purpose of my code is to web scrape a table that has multiple pages.
So far, using Selenium and bs4, I've managed to do just that. However, I'm having trouble breaking out of my loop: the last page still has the 'next' button, so the program keeps scraping the last page over and over.
from selenium import webdriver
from bs4 import BeautifulSoup as bs
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.keys import Keys
import csv
import datetime as dt

# website url
url = "https://poit.bolagsverket.se/poit/PublikSokKungorelse.do?method=redirect&forward=main.no.sidebar.sokresultat"

# website
driver = webdriver.Chrome()
driver.get(url)

# click sök kungörelse
driver.find_element_by_xpath('//*[@id="nav1-2"]').click()

# click avancerad sökning
driver.find_element_by_xpath('//*[@id="content"]/form/div[2]/a').click()

# select "annan period"
select = Select(driver.find_element_by_id('tidsperiod'))
select.select_by_value('6')

# select "skuldsanering"
select = Select(driver.find_element_by_id('amnesomrade'))
select.select_by_value('5')

# select "inledande av skuldsanering"
select = Select(driver.find_element_by_id('kungorelserubrik'))
select.select_by_value('29')

# calculate date
today = dt.date.today()
last_monday = str(today - dt.timedelta(days=7))
last_friday = str(today - dt.timedelta(days=3))

# insert search date
inputElement = driver.find_element_by_id("from")
inputElement.send_keys(last_monday)
inputElement = driver.find_element_by_id("tom")
inputElement.send_keys(last_friday)

# click on "sök"
driver.find_element_by_xpath('//*[@id="SokKungorelse"]').click()

# get updated page source
html = driver.page_source

# scrape table
with open('skuldsanering.txt', 'w', encoding='utf-8') as r:
    while True:
        html = driver.page_source
        soup = bs(html, 'html.parser')
        table = soup.find('tbody')
        table_rows = table.find_all('tr')
        for tr in table_rows:
            td = tr.find_all('td')
            row = [i.get_text(strip=True) for i in td]
            csv_writer = csv.writer(r)
            csv_writer.writerows([row])
        try:
            driver.find_element_by_xpath('//*[@id="movenextTop"]').click()
            soup = bs(html, 'html.parser')
        except:
            # insert condition to break out of loop
            break
I was thinking it might be possible to include a click counter and break out of the loop when the number of clicks (x) equals y in "Page x of y". If that's a good solution, how do I move forward? If not, what would be a better solution?
Thank you very much in advance!
The results page shows "Page x of y"; you can check whether x == y on each iteration and break the loop when it's true.
Here's the tag I'm talking about.
<em class="gotopagebuttons">Sida 17 av 17</em>
You can split the string or try regex to get both the page numbers and then compare them.
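For example, a minimal sketch of the regex approach (assuming the label always has the form "Sida x av y", as in the tag above):

import re
from bs4 import BeautifulSoup

def is_last_page(page_source):
    # Parse the "Sida x av y" label and report whether x == y
    soup = BeautifulSoup(page_source, 'html.parser')
    label = soup.find('em', class_='gotopagebuttons').get_text(strip=True)
    match = re.search(r'Sida (\d+) av (\d+)', label)
    return match is not None and match.group(1) == match.group(2)

Call it on driver.page_source after scraping each page and break out of the loop when it returns True.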
Hmm, yeah, I'm not really a fan of extracting the page number from raw text, but it seems to be the most convenient option; I can't really think of another way of doing it.
Try this:
def main():

    from selenium import webdriver
    from selenium.webdriver.support.ui import Select
    from bs4 import BeautifulSoup
    import datetime as dt
    import re

    url = "https://poit.bolagsverket.se/poit/PublikSokKungorelse.do"
    driver = webdriver.Chrome()
    driver.get(url)

    driver.find_element_by_xpath('//*[@id="nav1-2"]').click()
    driver.find_element_by_xpath('//*[@id="content"]/form/div[2]/a').click()

    select = Select(driver.find_element_by_id('tidsperiod'))
    select.select_by_value('6')

    select = Select(driver.find_element_by_id('amnesomrade'))
    select.select_by_value('5')

    select = Select(driver.find_element_by_id('kungorelserubrik'))
    select.select_by_value('29')

    today = dt.date.today()
    last_monday = str(today - dt.timedelta(days=7))
    last_friday = str(today - dt.timedelta(days=3))

    inputElement = driver.find_element_by_id("from")
    inputElement.send_keys(last_monday)
    inputElement = driver.find_element_by_id("tom")
    inputElement.send_keys(last_friday)

    driver.find_element_by_xpath('//*[@id="SokKungorelse"]').click()

    while True:
        page = driver.page_source
        soup = BeautifulSoup(page, "html.parser")

        label = soup.find("em", {"class": "gotopagebuttons"}).get_text(strip=True)
        pattern = r"Sida (\d+) av (\d+)"
        match = re.match(pattern, label)
        assert match is not None
        print(match.group())

        for row in soup.find("tbody").find_all("tr"):
            for td in row.find_all("td"):
                text = td.get_text(strip=True)
                print(" " * 4 + text)
            print(end="\n\n")

        if match.group(1) == match.group(2):
            # No more pages
            break

        driver.find_element_by_xpath('//*[@id="movenextTop"]').click()

    return 0


if __name__ == "__main__":
    import sys
    sys.exit(main())
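If you still want the rows written to skuldsanering.txt instead of printed to stdout, one option (a minimal sketch re-using the csv module from your original script) is to open the file once before the while loop and call a small helper for each page:

import csv
from bs4 import BeautifulSoup

def write_rows(page_source, csv_writer):
    # Append every table row on the current page to the open CSV writer
    soup = BeautifulSoup(page_source, "html.parser")
    for tr in soup.find("tbody").find_all("tr"):
        csv_writer.writerow([td.get_text(strip=True) for td in tr.find_all("td")])

# usage: open the file once, then call write_rows(driver.page_source, writer)
# inside the while loop in place of the print calls, e.g.
# with open("skuldsanering.txt", "w", newline="", encoding="utf-8") as f:
#     writer = csv.writer(f)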
I should be receiving 100 different movies with their movie name, source, rating, text review, and date in data.head(), from the Rotten Tomatoes website.
from bs4 import BeautifulSoup
import re
import time
import requests
import pandas as pd
#!pip install selenium
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service

movie_list = ['divergent', 'top_gun', 'pursuit_of_happiness']

with open(name + "_" + ".csv", 'w', encoding='utf-8') as fw:
    for movie in movie_list:
        pageLink = 'https://www.rottentomatoes.com/m/' + movie + '/reviews/'
        path = "/Users/name/desktop/chromedriver"
        s = Service(path)
        browser = webdriver.Chrome(service=s)
        browser.get(pageLink)

        pageNum = 10000
        for p in range(0, pageNum):
            print('page', p + 1)
            page_source = browser.page_source
            soup = BeautifulSoup(page_source, 'lxml')
            reviews = soup.findAll('div', {'class': re.compile('review_table_row')})
            for review in reviews:
                rating, text, date = 'NA', 'NA', 'NA'
                rating_info = review.find('div', {'class': re.compile("review_icon")})
                if rating_info:
                    rating = rating_info.attrs["class"][3]
                    print(rating)
                text_info = review.find('div', {'class': re.compile("the_review")})
                if text_info:
                    text = text_info.text.strip()
                    print(text)
                review_date = review.find('div', {'class': re.compile("review-date subtle small")})
                if review_date:
                    date = review_date.text.strip()
                    print(date)
                fw.write(rating + '\t' + text + '\t' + date + '\n')
            # move to the next page by clicking on the "next" button with selenium
            if p < pageNum:
                browser.find_element(By.XPATH, '//button[@class="js-prev-next-paging-next btn prev-next-paging__button prev-next-paging__button-right"]').click()
                time.sleep(2)
                #<span class="prev-next-paging__button-text">Next</span>
        browser.quit()

data = pd.read_csv("your_name.csv", delimiter="\t", header=None)
data.columns = ['Movie', 'Source', 'Rating', 'Text_Review', 'Date']
data.head()
I was trying to do it manually, but I think there is a faster and more efficient way to do it by web scraping; however, I am not sure how. Maybe by using a link that contains the top 100 movies?
Can anyone help with scraping https://www.whed.net/home.php?
The code I'm using gives me an empty DataFrame. I would love to have the universities with their websites and maybe the field of study. My scraping skills are weak, so it would be great if you could guide me through this. Thanks, guys.
import time
import pandas as pd
from selenium import webdriver as wb
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import NoSuchElementException

begin = time.time()

countries = ['Emirates', 'United States of America (all)']
result = []      # List to store all data
univ_links = []  # Links for all universities
fields = ['Street:', 'City:', 'Province:', 'Post Code:', 'WWW:', 'Fields of study:', 'Job title:']

webD = wb.Chrome(executable_path=r'C:\Users\Admin\OneDrive\Sagasit\chromedriver.exe')  # To launch chrome and run script

# Trigger the target website
webD.get("https://www.whed.net/results_institutions.php")
webD.implicitly_wait(5)

#all_countries=[]
cntry_el = webD.find_elements_by_xpath('//*[@id="Chp1"]/option')
#cntry_grp = webD.find_elements_by_xpath('//*[@id="Chp1"]/optgroup')
grps = webD.find_elements_by_xpath('//*[@id="Chp1"]/optgroup/option[1]')
for c in cntry_el: countries.append(c.text)
for g in grps: countries.append(g.text)

for cntry in countries:
    select = Select(webD.find_element_by_id('Chp1'))  # select country dropdown
    select.select_by_visible_text(cntry)  # choosing country
    Btn_GO = webD.find_element_by_xpath('//*[@id="fsearch"]/p/input')
    Btn_GO.click()
    select_rpp = Select(webD.find_element_by_name('nbr_ref_pge'))  # select results per page drop down
    select_rpp.select_by_visible_text('100')  # choosing 100 results per page option
    university_form = webD.find_element_by_xpath('//*[@id="contenu"]').find_element_by_id('results')
    university_list = university_form.find_elements_by_xpath('//*[@id="results"]/li')  # list of university elements
    for univ in range(len(university_list)):
        href = university_list[univ].find_element_by_class_name('details').find_elements_by_tag_name('a')[0].get_property('href')  # University details link
        univ_links.append(href)
    while True:
        try:
            webD.find_element_by_partial_link_text('Next').click()
            university_form = webD.find_element_by_xpath('//*[@id="contenu"]').find_element_by_id('results')
            university_list = university_form.find_elements_by_xpath('//*[@id="results"]/li')
            for univ in range(len(university_list)):
                href = university_list[univ].find_element_by_class_name('details').find_elements_by_tag_name('a')[0].get_property('href')  # University details link
                univ_links.append(href)
        except NoSuchElementException:
            break

for l in univ_links:
    webD.get(l)
    webD.implicitly_wait(2)
    title = webD.find_element_by_xpath('//*[@id="page"]/div/div/div[2]/div[1]').text
    title_detailed = webD.find_element_by_xpath('//*[@id="page"]/div/div/div[2]/div[2]').text
    cntry_name = webD.find_element_by_xpath('//*[@id="contenu"]/p[2]').text
    t1 = webD.find_elements_by_class_name('dt')
    t2 = webD.find_elements_by_class_name('dd')
    labels = webD.find_elements_by_class_name('libelle')
    content = webD.find_elements_by_class_name('contenu')
    temp = {}
    fos = ''
    fos1 = ''
    temp.update({'Title': title, 'Detailed Title': title_detailed, 'Country': cntry_name})
    for i in range(len(t1)):
        if t1[i].text == '' or t1[i].text == 'Address':
            continue
        else:
            value = t2[i].text
            temp.update({t1[i].text: value.replace('\n', ',')})
    for j in range(len(content)):
        if labels[j].text in fields:
            if labels[j].text == 'Fields of study:':
                info = content[j].text
                fos = fos + ',' + info
            elif labels[j].text == 'Job title:':
                info1 = content[j].text
                fos1 = fos1 + ',' + info1
            else:
                key = labels[j].text
                temp.update({key[:-1]: content[j].text})
    temp.update({'Fields of study': fos.lstrip(','), 'Job titles': fos1.lstrip(',')})
    result.append(temp)

data = pd.DataFrame(result)
data

end = time.time()
print("Time taken : " + str(end - begin) + "s")

data.to_csv("WHED1.csv", index=False)
This code is what I could use, taken from a GitHub project.
It would be great if I could re-create the data and save it. I want to use it as a dropdown in a web application, just to make sure there are no mistakes in how the university someone studied at is written.
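For the dropdown part, what I have in mind is something like this minimal sketch (assuming the WHED1.csv produced by the code above, with a Title column holding the university names):

import pandas as pd

def load_university_names(path="WHED1.csv"):
    # Read the scraped CSV and return a sorted, de-duplicated list of names
    # suitable for populating a dropdown widget.
    df = pd.read_csv(path)
    return sorted(df["Title"].dropna().unique())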
Update 1/12/22 - Async
Found a much better solution using aiohttp; it also runs through the entire list of countries in about 30 seconds instead of 3 hours.
import json
import time
import aiohttp
import asyncio
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver.support.select import Select
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service


def main():
    print("Init")
    driver = init_driver()

    print("Opening Homepage")
    url = "https://www.whed.net/results_institutions.php"
    driver.get(url)
    time.sleep(1)

    print("Gathering Countries")
    countries = get_countries(driver)
    driver.quit()

    print("Scraping")
    start = time.time()
    institution_list = asyncio.run(fetch_all(countries))

    print("Writing out")
    f = open('output.json', 'w')
    f.write(json.dumps(institution_list))
    f.close()

    end = time.time()
    print(f"Total time: {end - start}s")


def init_driver():
    chrome_executable = Service(executable_path='chromedriver.exe', log_path='NUL')
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    driver = webdriver.Chrome(service=chrome_executable, options=chrome_options)
    return driver


def get_countries(driver):
    select = Select(driver.find_element(By.ID, "Chp1"))
    countries = list(map(lambda c: c.get_attribute('value'), select.options))
    countries.pop(0)
    return countries


def extract_institutions(html, country):
    soup = BeautifulSoup(html, 'html.parser')
    page = soup.find('p', {'class': 'infos'}).text
    print(str(page))
    number_of_institutions = str(page).split()[0]

    if number_of_institutions == 'No':
        print(f"No results for {country}")
        return []

    results = []
    inst_index = 0
    raw = soup.find_all('a', {'class': 'fancybox fancybox.iframe'})
    for i in raw:
        results.append({
            'name': str(i.text).strip(),
            'url': 'https://www.whed.net/' + str(i.attrs['href']).strip(),
            'country': country
        })
        inst_index += 1

    return {
        'country': country,
        'count': number_of_institutions,
        'records': results
    }


async def get_institutions(country, session):
    try:
        async with session.post(
            url='https://www.whed.net/results_institutions.php',
            data={"Chp1": country, "nbr_ref_pge": 10000}
        ) as response:
            html = await response.read()
            print(f"Successfully got {country}")
            return extract_institutions(html, country)
    except Exception as e:
        print(f"Unable to get {country} due to {e.__class__}.")


async def fetch_all(countries):
    async with aiohttp.ClientSession() as session:
        return await asyncio.gather(*[get_institutions(country, session) for country in countries])


# Main call
main()
Old answer using synchronous algorithm
Improving on @Mithun's answer, since it doesn't really work as-is: it gets stuck on the same page.
Also added direct access to the name and url to make it easier in case you want to access those.
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service

print("Init")
chrome_executable = Service(executable_path='chromedriver.exe', log_path='NUL')
chrome_options = Options()
chrome_options.add_argument("--headless")
driver = webdriver.Chrome(service=chrome_executable, options=chrome_options)

print("Opening Homepage")
url = "https://www.whed.net/results_institutions.php"
driver.get(url)
time.sleep(1)

print("Selecting country")
select = Select(driver.find_element(By.ID, "Chp1"))
country = "Albania"
select.select_by_visible_text(country)
time.sleep(.5)

print("Searching")
driver.find_element(By.XPATH, "//input[@value='Go']").click()
time.sleep(1)

print("Parsing")
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')
page = soup.find('p', {'class': 'infos'}).text
number_of_pages = str(page).split()[0]
counter = 10

results = []
while True:
    raw = soup.find_all('a', {'class': 'fancybox fancybox.iframe'})
    for i in raw:
        results.append({
            'name': str(i.text).strip(),
            'url': 'https://www.whed.net/' + str(i.attrs['href']).strip(),
            'country': country
        })
    print(f'{len(results)}/{number_of_pages}')

    if counter >= int(number_of_pages):
        break

    counter += 10
    driver.find_element(By.LINK_TEXT, "Next page").click()
    time.sleep(0.5)
    soup = BeautifulSoup(driver.page_source, 'html.parser')

driver.quit()
print(results)
You can use Selenium to scrape the data. The following code will help you scrape the university names for "United States of America (all)". Similarly, you can scrape other countries as well, either in a loop or by entering the name manually. If you need the field of study for every university, you can scrape its href using bs4 and then pull the field of study from the detail page (see the sketch after the code).
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select

driver = webdriver.Chrome(r"chromedriver.exe")
url = "https://www.whed.net/results_institutions.php"
driver.get(url)
time.sleep(1)

select = Select(driver.find_element(By.ID, "Chp1"))
select.select_by_visible_text("United States of America (all)")
time.sleep(1)

driver.find_element(By.XPATH, "//input[@value='Go']").click()
time.sleep(1)

html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')
page = soup.find('p', {'class': 'infos'}).text
number_of_pages = str(page).split()[0]
counter = 10

while counter < int(number_of_pages):
    raw = soup.find_all('div', {'class': 'details'})
    for i in raw:
        i = (str(i.text).lstrip())
        i = i.replace("\n", "")
        i = i.replace("\r", "")
        i = i.replace("\t", "")
        print(i)
    next_page = driver.find_element(By.LINK_TEXT, "Next page").click()
    counter += 10

driver.quit()
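For the field-of-study part, here is a rough sketch only (it assumes the detail-page markup that the question's own code relies on, i.e. label elements with class "libelle" and values with class "contenu", and a detail-page URL collected from one of the links above):

import requests
from bs4 import BeautifulSoup

def get_fields_of_study(detail_url):
    # Fetch a university detail page and pull the value next to the
    # "Fields of study:" label, if it is present on the page.
    soup = BeautifulSoup(requests.get(detail_url).text, 'html.parser')
    for label in soup.find_all(class_='libelle'):
        if label.get_text(strip=True) == 'Fields of study:':
            value = label.find_next(class_='contenu')
            return value.get_text(strip=True) if value else ''
    return ''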
I am trying to extract the table of qualifications in scope, but I am having a hard time doing it since it's my first time. Can anyone please help me?
URL of the website I am scraping: https://training.gov.au/Organisation/Details/31102
import re
import csv
import time
from pathlib import Path
from selenium import webdriver
import bs4 as bs4
import os
import copy

option = webdriver.ChromeOptions()
option.add_argument("--incognito")
option.add_argument("--headless")
exec_path = '/Users/Downloads/MUR_scraping-master/Libraries/chromedriver'
browser = webdriver.Chrome(executable_path=exec_path, options=option)

# read the url from each file into a list
course_links_file_path = Path(os.getcwd().replace('\\', '/'))
course_links_file_path = course_links_file_path.__str__() + '/links.txt'
course_links_file = open(course_links_file_path, 'r')

# the csv file we'll be saving the courses to
csv_file_path = Path(os.getcwd().replace('\\', '/'))
csv_file = csv_file_path.__str__() + '/Reading_undergraduate.csv'

for each_url in course_links_file:
    # print(each_url)
    try:
        browser.get(each_url)
    except:
        print(each_url)
        pass

    pure_url = each_url.strip()
    each_url = browser.page_source
    delay_ = 15
    soup = bs4.BeautifulSoup(each_url, 'lxml')

    desc_div = soup.find('div', class_='t-content t-state-active')
    if desc_div:
        desc_list = []
        desc_p_list = desc_div.find_all(class_='display-row')
        if desc_p_list:
            for p in desc_p_list:
                desc_list.append(p.get_text())
            desc_list = ' '.join(desc_list)
            #print(desc_list)

    table = soup.find('table')
    table_rows = table.find_all('tr')
    for tr in table_rows:
        td = tr.find_all('td')
        row = [i.text for i in td]
        print(row)
Not my best code, but the following scrapes the table into a 2D array. My solution is a bit sloppy, but I hope this is something you can work with.
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import bs4 as bs4

option = webdriver.ChromeOptions()
option.add_argument("--incognito")
option.add_argument("--headless")
exec_path = "TODO: Add your path"
browser = webdriver.Chrome(executable_path=exec_path, options=option)
browser.get("https://training.gov.au/Organisation/Details/31102")

# open the scope tab
browser.find_element_by_css_selector('a#detailsScopeTab').click()

# wait for the table to load
element = WebDriverWait(browser, 20).until(
    EC.presence_of_element_located(
        (By.XPATH, '//div[@id="ScopeQualification"]//div[@class="t-pagesize-wrapper"]')))

# click on the button to have all rows at once
browser.find_element_by_xpath('//div[@id="ScopeQualification"]//div[@class="t-pagesize-wrapper"]/a[last()]').click()


# wait for the table to load the new data
class element_css_class_flashed(object):
    def __init__(self, locator, css_class):
        self.locator = locator
        self.css_class = css_class
        self.was_on = False

    def __call__(self, driver):
        element = driver.find_element(*self.locator)
        if self.css_class in element.get_attribute("class"):
            self.was_on = True
        elif self.was_on:
            return element
        else:
            return False


try:
    wait = WebDriverWait(browser, 3)
    element = wait.until(element_css_class_flashed(
        (By.XPATH, '//div[@id="ScopeQualification"]//div[@class="t-status"]/a'),
        "t-loading"))
except:
    # most likely the loading was too fast to detect
    pass

soup = bs4.BeautifulSoup(browser.page_source, 'html.parser')
table = soup.select_one('div#ScopeQualification table')
all_rows = table.find_all('tr')
header_row = all_rows[0]
rows = all_rows[1:-1]

data = [[col.text for col in header_row.find_all('th')]]
for row in rows:
    data_row = []
    for col in row.find_all('td'):
        data_row.append(col.text)
    data.append(data_row)

print(data)
I want to scrape the table from this website:
https://www.oddsportal.com/moving-margins/
I need data inside the table #moving_margins_content_overall
I tried this code, but some games contain many class="odd" rows and I don't know how to associate the class="odd" data with the class="dark" data.
import requests
from bs4 import BeautifulSoup
import time
import json
import csv
from selenium import webdriver

u = 'https://www.oddsportal.com/moving-margins/'
driver = webdriver.Chrome(executable_path=r"C:\chromedriver.exe")
driver.get(u)
driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
driver.implicitly_wait(60)  # seconds
time.sleep(2)

elem = driver.find_element_by_xpath("//*")
source_code = elem.get_attribute("innerHTML")
soup = BeautifulSoup(source_code, 'html.parser')

for k in soup.select('#moving_margins_content_overall .table-main tbody tr'):
    sport = k.select_one('tr.dark th > a').get_text(strip=True)  # sport
    country = soup.select_one('tr.dark th a:nth-child(3) span').get_text(strip=True)  # country
    competition = soup.select_one('tr.dark th a:nth-child(5)').get_text(strip=True)  # competition
You can use the code below to store all the data in a list, in which each row on the page is stored as a list.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

u = 'https://www.oddsportal.com/moving-margins/'
driver = webdriver.Chrome(executable_path=r"C:\chromedriver.exe")
driver.maximize_window()
driver.get(u)

# Use an explicit wait for fast execution
WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.CSS_SELECTOR, "#moving_margins_content_overall")))
driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")

table_data = driver.find_elements_by_xpath("//div[@id='moving_margins_content_overall']//tr[@class='odd' or @class='dark']")
table = []

# Creating a list of lists, where each list holds all data in a row with class dark or odd
for data in table_data:
    row = []
    dark_row = data.find_elements_by_xpath((".//th//a"))
    for col in dark_row:
        row.append(col.text.replace("\n", " "))
    row.append(data.find_element_by_xpath(".//following-sibling::tr//th[@class='first2']").text)  # Add data in first2 th
    odd_row = data.find_elements_by_xpath((".//following-sibling::tr[@class='odd']//td"))
    for col in odd_row:
        row.append(col.text.replace("\n", " "))
    row.append(odd_row[-1].find_element_by_xpath('.//a').get_attribute("title"))  # Add bookmaker name
    table.append(row)

for t in table:
    print(t)
Output: as you can see, for the rugby union match there are two sets of odds, so the list for that game is longer.
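If you'd rather persist the result than print it, a minimal sketch (re-using the csv module the question already imports, with a hypothetical output filename, and assuming the table list built in the loop above) could follow the loop:

import csv

# Write the collected rows to a tab-separated file; rows have varying
# lengths, so csv.writer simply emits as many columns as each row has.
with open("moving_margins.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f, delimiter="\t")
    writer.writerows(table)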
I am scraping pages one by one, but the problem I ran into today is that when the next page doesn't exist, the site just serves the previous page again, without any error from which I could determine that the page was the last one.
For example: https://example/page-7. When I try to go to https://example/page-8, which doesn't exist, it gives me the last page, https://example/page-7.
How can I determine that https://example/page-7 was the last page using Python 3?
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import urllib.request

page = "https://www.supermama.lt/forumas/topic/214375-vilma/page-1"

driver = webdriver.Firefox()
driver.get(page)
dd = driver.page_source

for j in range(100):
    soup = BeautifulSoup(dd, "html.parser")
    my_text = list(soup.findAll("div", class_="post-content"))
    for i in my_text:
        # collect some data
        pass

    page = "https://www.supermama.lt/forumas/topic/214375-vilma/page-{0}".format(j + 2)
    driver.get(page)
    dd = driver.page_source
At first I was thinking about checking for duplicates in the collected data, but this is too slow because I have 30,000 links from which I have to collect data. Maybe there is an easier solution?
Found the answer to my own question.
To find the current page URL, just use driver.current_url.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import urllib.request

page = "https://www.supermama.lt/forumas/topic/214375-vilma/"

driver = webdriver.Firefox()
driver.get(page)
dd = driver.page_source

current_pages = []
for j in range(100):
    page_url = driver.current_url
    if page_url not in current_pages:
        current_pages.append(page_url)

        soup = BeautifulSoup(dd, "html.parser")
        my_text = list(soup.findAll("div", class_="post-content"))
        for i in my_text:
            # collect some data
            pass

        page = "https://www.supermama.lt/forumas/topic/214375-vilma/page-{0}".format(j + 2)
        driver.get(page)
        dd = driver.page_source
    else:
        print(current_pages)
        driver.quit()
        break