Python Selenium - Scraping Trip Advisor Reviews

I am only a hobbyist with Python, so please bear with me. I am trying to run this script to collect Trip Advisor reviews and write them to a CSV file that I can open in Excel.
But once it opens the website it throws this error: NoSuchElementException no such element: Unable to locate element: {"method":"xpath","selector":".//q[@class='IRsGHoPm']"}
Has anyone got any ideas on what is going wrong?
import csv  # This package lets us save data to a csv file
from selenium import webdriver  # The Selenium package we'll need
from selenium.webdriver.support import expected_conditions
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException
import time  # This package lets us pause execution for a bit

path_to_file = "E:Desktop/Data/Reviews.csv"
pages_to_scrape = 3
url = "https://www.tripadvisor.com/Hotel_Review-g60982-d209422-Reviews-Hilton_Waikiki_Beach-Honolulu_Oahu_Hawaii.html"

# import the webdriver
driver = webdriver.Chrome()
driver.get(url)

# open the file to save the reviews
csvFile = open(path_to_file, 'a', encoding="utf-8")
csvWriter = csv.writer(csvFile)

# change the value of pages_to_scrape to set the number of pages we're going to grab
for i in range(0, pages_to_scrape):
    # give the DOM time to load
    time.sleep(5)

    # Click the "expand review" link to reveal the entire review.
    driver.find_element_by_xpath(".//div[contains(@data-test-target, 'expand-review')]").click()

    # Now we'll ask Selenium to look for elements in the page and save them to a variable.
    # First let's define a container that will hold all the reviews on the page.
    # In a moment we'll parse these and save them:
    container = driver.find_elements_by_xpath("//div[@data-reviewid]")

    # Next we'll grab the date of the review:
    dates = driver.find_elements_by_xpath(".//div[@class='_2fxQ4TOx']")

    # Now we'll look at the reviews in the container and parse them out
    for j in range(len(container)):  # A loop defined by the number of reviews
        # Grab the rating
        rating = container[j].find_element_by_xpath(".//span[contains(@class, 'ui_bubble_rating bubble_')]").get_attribute("class").split("_")[3]
        # Grab the title
        title = container[j].find_element_by_xpath(".//div[contains(@data-test-target, 'review-title')]").text
        # Grab the review
        review = container[j].find_element_by_xpath(".//q[@class='IRsGHoPm']").text.replace("\n", " ")
        # Grab the date
        date = " ".join(dates[j].text.split(" ")[-2:])
        # Save that data in the csv and then continue to process the next review
        csvWriter.writerow([date, rating, title, review])

    # When all the reviews in the container have been processed, change the page and repeat
    driver.find_element_by_xpath('.//a[@class="ui_button nav next primary "]').click()

# When all pages have been processed, quit the driver
driver.quit()

I tried finding the "_2fxQ4TOx" class name in the page and it came up with 0 results. These are probably auto-generated, minified class names produced by the site's build tooling, so they change over time and are not reliable selectors.
Instead, try to find elements by attributes that actually exist in the frontend, for example:
container = driver.find_element_by_css_selector('div[data-test-target="reviews-tab"]')
reviews = container.find_elements_by_css_selector('div[data-test-target="HR_CC_CARD"]')
for review in reviews:
    # get date
    # get descriptions
    # ...
You can also refer to this link:
Is there a way to find an element by attributes in Python Selenium?
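To make the idea concrete, here is a minimal sketch of how the card loop could pull out the fields. The data-test-target value "review-title" and the q tag for the review body come from the question's own selectors; the exact markup may have changed since, so re-check them in the browser's inspector:

from selenium import webdriver

driver = webdriver.Chrome()
driver.get("https://www.tripadvisor.com/Hotel_Review-g60982-d209422-Reviews-"
           "Hilton_Waikiki_Beach-Honolulu_Oahu_Hawaii.html")

container = driver.find_element_by_css_selector('div[data-test-target="reviews-tab"]')
reviews = container.find_elements_by_css_selector('div[data-test-target="HR_CC_CARD"]')

rows = []
for card in reviews:
    # 'review-title' appears in the question's XPath; the <q> tag for the
    # body is taken from the question's selector as well.
    title = card.find_element_by_css_selector('div[data-test-target="review-title"]').text
    body = card.find_element_by_css_selector('q').text.replace("\n", " ")
    rows.append([title, body])

print(rows[:3])
driver.quit()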

Related

retrieving chrome driver element data from returned form

I'm trying to build a simple scraper with Selenium to retrieve the zip code for a given address, city and state from this USPS tool: https://tools.usps.com/zip-code-lookup.htm?byaddress
Here is my code, which works for most steps, but I am struggling to retrieve the data I need at the very end (the zip code):
from selenium import webdriver
import time
import pandas as pd
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By

# step 1: set path to chromedriver downloaded from here: https://chromedriver.chromium.org/downloads
PATH = "chromedriver"  # modify to location on your system of above downloaded driver
driver = webdriver.Chrome(PATH)
driver.get("https://tools.usps.com/zip-code-lookup.htm?byaddress")

# step 2: specify the address to search for
street_address = "530 WILLIAM PENN PL"
city = "PITTSBURGH"
state = "PA"

# step 3: fill out the form with specified data in step 2
input_address = driver.find_element_by_id('tAddress')
input_city = driver.find_element_by_id('tCity')
drpState = Select(driver.find_element_by_id('tState'))
input_address.send_keys(street_address)
time.sleep(1)
input_city.send_keys(city)
time.sleep(1)
drpState.select_by_value(state)

# step 4: select "Find button" on USPS page to advance
button_find = driver.find_element_by_id('zip-by-address')
button_find.click()
time.sleep(2)

# step 5: retrieve zip code (the problem)
zipcode = driver.find_element(By.XPATH, '//*[@id="zipByAddressDiv"]/ul/li[1]/div[1]/p[3]/strong')
attrs = []
for attr in zipcode.get_property('attributes'):
    attrs.append([attr['name'], attr['value']])
print(attrs)
As you can see in the screenshot below, at the very end I specify an XPath which I obtained by inspecting the zip code element. I then try to list the attributes of the zipcode WebElement, but the list comes out empty; there is no error, just nothing is returned in the attributes of the object.
Would appreciate any help, thank you in advance.
I have no idea why you try to use get_property('attributes') here.
To get any attributes this page would need <strong name="..." value="...">, but it has only <strong>.
If you want the tag name, use zipcode.tag_name, and if you want the text 15219-1820 inside <strong></strong>, use zipcode.text:
zipcode = driver.find_element(By.XPATH, '//*[@id="zipByAddressDiv"]/ul/li[1]/div[1]/p[3]/strong')
print(zipcode.tag_name)
print(zipcode.text)
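As a side note, the fixed time.sleep(2) after clicking Find can be flaky. Continuing from the question's script (same driver and the same XPath, which is carried over from the question and may need re-checking), a sketch with an explicit wait might look like this:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Wait up to 10 seconds for the result element to appear instead of sleeping.
zipcode = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located(
        (By.XPATH, '//*[@id="zipByAddressDiv"]/ul/li[1]/div[1]/p[3]/strong')))
print(zipcode.text)  # e.g. 15219-1820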
(Referencing off the image)
You could instead get the element with the class zipcode-by-address, get its child elements, and find the <strong>:
driver = webdriver.Firefox()
...  # navigation stuff here.
element = driver.find_element_by_class_name("zipcode-by-address")
all_children_by_xpath = element.find_elements_by_xpath(".//*")  # https://stackoverflow.com/questions/24795198/get-all-child-elements
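From there, a short sketch of filtering those children for the <strong> tag (the class name zipcode-by-address is taken from the answer above and should be verified against the current page):

# Keep only the <strong> elements among the children and print their text.
strong_values = [child.text for child in all_children_by_xpath
                 if child.tag_name == "strong"]
print(strong_values)  # the zip code(s) should be among these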

Beautifulsoup4 find_all not getting the results I need

I'm trying to get data from flashscore.com for a project I'm doing as part of my self-taught Python study:
import requests
from bs4 import BeautifulSoup
res = requests.get("https://www.flashscore.com/")
soup = BeautifulSoup(res.text, "lxml")
games = soup.find_all("div", {'class':['event__match', 'event__match--scheduled', 'event__match--twoLine']})
print(games)
When I run this, it gets me an empty list []
Why?
When an empty list is returned by find_all(), it means the elements you specified could not be found.
Make sure that what you are trying to scrape isn't added dynamically, for example inside an iframe in some cases; a quick check for this is sketched below.
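One way to check is to look for the class name in the raw HTML that requests returns; a minimal sketch (event__match is the class from the question):

import requests

res = requests.get("https://www.flashscore.com/")
# If the class never occurs in the static HTML, the matches are rendered
# by JavaScript and BeautifulSoup alone will not see them.
print("event__match" in res.text)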
The failure is due to the fact that the website uses a set of Ajax technologies, specifically content added dynamically with the help of JavaScript. Client-side scripting code is executed in the browser itself, not at the web server level, and its success depends on the browser's ability to interpret and execute it correctly. With the BeautifulSoup library in the program you wrote, you only inspect the static HTML code. The JavaScript-rendered content can be reached, for example, with the help of the Selenium library: https://www.selenium.dev/. Below is the full code for the data that I suppose you are interested in:
# crawler_her_sel.py
import time
from selenium.webdriver import Firefox
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
import pandas as pd


def firefoxdriver(my_url):
    """
    Prepare the browser for the work and add the headers to
    the browser.
    """
    # Prepare the headless Firefox browser.
    options = Options()
    options.add_argument("--headless")
    driver = Firefox(options=options)
    return driver


def scrapingitems(driver, my_list, my_xpath):
    """
    Create appropriate lists of the data for the pandas library.
    """
    try:
        elem_to_scrap = driver.find_element(By.XPATH, my_xpath).text
        my_list.append(elem_to_scrap)
    except:
        elem_to_scrap = ""
        my_list.append(elem_to_scrap)


# Variable with the URL of the website.
my_url = "https://www.flashscore.com/"

# Prepare the headless browser for the work.
driver = firefoxdriver(my_url)

# Load the website code as the Selenium object.
driver.get(my_url)

# Prepare the blank dictionary to fill in for pandas.
matches = {}

# Prepare the lists for the scraped data.
countries = []
leagues = []
home_teams = []
scores_home = []
scores_away = []
away_teams = []

# Wait for the page to fully render.
try:
    element = WebDriverWait(driver, 25).until(
        EC.presence_of_element_located((By.CLASS_NAME, "adsclick")))
except TimeoutException:
    print("Loading took too much time! Please rerun the script.")
except Exception as e:
    print(str(e))
else:
    # Load the website code as the BeautifulSoup object.
    pageSource = driver.page_source
    bsObj = BeautifulSoup(pageSource, "lxml")

    # Determine the number of football matches with the help of BeautifulSoup.
    games_1 = bsObj.find_all(
        "div", {"class": "event__participant event__participant--home"})
    games_2 = bsObj.find_all(
        "div", {"class": "event__participant event__participant--home fontBold"})
    games_3 = bsObj.find_all(
        "div", {"class": "event__participant event__participant--away"})
    games_4 = bsObj.find_all(
        "div", {"class": "event__participant event__participant--away fontBold"})

    # Determine the number of countries for the given football matches.
    all_countries = driver.find_elements(By.CLASS_NAME, "event__title--type")

    # Determine the number of loop iterations.
    sum_to_iterate = (len(all_countries) + len(games_1) + len(games_2)
                      + len(games_3) + len(games_4))

    for ind in range(1, (sum_to_iterate + 1)):
        # Scrape the country names.
        xpath_countries = ('//div[@class="sportName soccer"]/div[' + str(ind)
                           + ']/div[2]/div/span[1]')
        scrapingitems(driver, countries, xpath_countries)

        # Scrape the league names.
        xpath_leagues = ('//div[@class="sportName soccer"]/div[' + str(ind)
                         + ']/div[2]/div/span[2]')
        scrapingitems(driver, leagues, xpath_leagues)

        # Scrape the home team names.
        xpath_home_teams = ('//div[@class="sportName soccer"]/div[' + str(ind)
                            + ']/div[3]')
        scrapingitems(driver, home_teams, xpath_home_teams)

        # Scrape the home team scores.
        xpath_scores_home = ('//div[@class="sportName soccer"]/div[' + str(ind)
                             + ']/div[5]')
        scrapingitems(driver, scores_home, xpath_scores_home)

        # Scrape the away team scores.
        xpath_scores_away = ('//div[@class="sportName soccer"]/div[' + str(ind)
                             + ']/div[6]')
        scrapingitems(driver, scores_away, xpath_scores_away)

        # Scrape the away team names.
        xpath_away_teams = ('//div[@class="sportName soccer"]/div[' + str(ind)
                            + ']/div[4]')
        scrapingitems(driver, away_teams, xpath_away_teams)

    # Add the lists with the scraped data to the dictionary in the correct order.
    matches["Countries"] = countries
    matches["Leagues"] = leagues
    matches["Home_teams"] = home_teams
    matches["Scores_for_home_teams"] = scores_home
    matches["Scores_for_away_teams"] = scores_away
    matches["Away_teams"] = away_teams

    # Create the data frame with the help of the pandas package.
    df_res = pd.DataFrame(matches)

    # Save the properly formatted data to a csv file. The date and the time
    # of the scraping are encoded in the file name.
    name_of_file = lambda: "flashscore{}.csv".format(
        time.strftime("%Y%m%d-%H.%M.%S"))
    df_res.to_csv(name_of_file(), encoding="utf-8")
finally:
    driver.quit()
The result of the script is a csv file which, when loaded into Excel, gives a table of countries, leagues, teams and scores.
It is worth mentioning that you need to download the necessary driver for your browser: https://www.selenium.dev/documentation/webdriver/getting_started/install_drivers/.
In addition, here are links to two other interesting scripts that relate to scraping the https://www.flashscore.com/ portal: How can i scrape a football results from flashscore using python and Scraping stats with Selenium.
I would also like to raise the legal issues here. The robots.txt file downloaded from https://www.flashscore.com/robots.txt shows that you can scrape the home page (a quick programmatic check is sketched below). But the „General Terms of Use” say, quoting: „Without prior authorisation in writing from the Provider, Visitors are not authorised to copy, modify, tamper with, distribute, transmit, display, reproduce, transfer, upload, download or otherwise use or alter any of the content of the App.”
This, unfortunately, introduces ambiguity, and ultimately it is not clear what the owner really wants. Therefore, I recommend that you do not use this script constantly, and certainly not for commercial purposes, and I ask the same of other visitors to this website. I myself wrote this script for the purpose of learning to scrape and I do not intend to use it at all.
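For instance, a minimal way to check the robots.txt rule programmatically with the standard library (the "*" user agent stands for any crawler):

from urllib.robotparser import RobotFileParser

rp = RobotFileParser("https://www.flashscore.com/robots.txt")
rp.read()
# True means the home page may be fetched according to robots.txt.
print(rp.can_fetch("*", "https://www.flashscore.com/"))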
The finished script can be downloaded from my GitHub.

Webscraping - Selenium - Python

I want to extract all the fantasy teams that have been entered for past contests. To loop through the dates, I just change a small part of the URL as shown in my code below:
# Packages:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec
import pandas as pd

# Driver
chromedriver = "C:/Users/Michel/Desktop/python/package/chromedriver_win32/chromedriver.exe"
driver = webdriver.Chrome(chromedriver)

# Dataframes that will be used later
results = pd.DataFrame()
best_lineups = pd.DataFrame()
opti_lineups = pd.DataFrame()

# For loop over all DATES:
calendar = []
calendar.append("2019-01-10")
calendar.append("2019-01-11")
for d in calendar:
    driver.get("https://rotogrinders.com/resultsdb/date/" + d + "/sport/4/")
Then, to access the different contests of that day, you need to click on the contest tab. I use the following code to locate and click on it.
# Find "Contest" tab
contest = driver.find_element_by_xpath("//*[@id='root']/div/main/main/div[2]/div[3]/div/div/div[1]/div/div/div/div/div[3]")
contest.click()
I simply inspected and copied the XPath of the tab. Most of the time it works, but sometimes I get the error message "Unable to locate element...". Moreover, it seems to work only for the first date in my calendar loop and always fails in the next iteration, and I do not know why. I have tried to locate it differently, but I feel I am missing something, for example:
contests = driver.find_element_by_xpath("//*[@role='tab']")
Once the contest tab is successfully clicked, all contests of that day are there and you can click on a link to access all the entries of that contest. I stored the contests in order to iterate through all of them as follows:
list_links = driver.find_elements_by_tag_name('a')
hlink = []
for ii in list_links:
    hlink.append(ii.get_attribute("href"))
sub = "https://rotogrinders.com/resultsdb"
con = "contest"
contest_list = []
for text in hlink:
    if sub in text:
        if con in text:
            contest_list.append(text)

# Iterate through all the entries (users) of a contest and extract the information of the team entered by the user
for c in contest_list:
    driver.get(c)
Then, I want to extract every participant's team entered in the contest and store it in a dataframe. I am able to do it successfully for the first page of the contest.
# Waits until tables are loaded and have text. Times out after 60 seconds
while WebDriverWait(driver, 60).until(ec.presence_of_element_located((By.XPATH, './/tbody//tr//td//span//a[text() != ""]'))):
    # while ????:
    # Get tables to get the user names
    tables = pd.read_html(driver.page_source)
    users_df = tables[0][['Rank', 'User']]
    users_df['User'] = users_df['User'].str.replace(' Member', '')

    # Initialize results dataframe and iterate through users
    for i, row in users_df.iterrows():
        rank = row['Rank']
        user = row['User']

        # Find the user name and click on the name
        user_link = driver.find_elements(By.XPATH, "//a[text()='%s']" % (user))[0]
        user_link.click()

        # Get the lineup table after clicking on the user name
        tables = pd.read_html(driver.page_source)
        lineup = tables[1]
        # print(user)
        # print(lineup)

        # Restructure to put into results dataframe
        lineup.loc[9, 'Name'] = lineup.iloc[9]['Salary']
        lineup.loc[10, 'Name'] = lineup.iloc[9]['Pts']
        temp_df = pd.DataFrame(lineup['Name'].values.reshape(-1, 11),
                               columns=lineup['Pos'].iloc[:9].tolist() + ['Total_$', 'Total_Pts'])
        temp_df.insert(loc=0, column='User', value=user)
        temp_df.insert(loc=0, column='Rank', value=rank)
        temp_df["Date"] = d
        results = results.append(temp_df)

    # next_button = driver.find_elements_by_xpath("//button[@type='button']")
    # next_button[2].click()

results = results.reset_index(drop=True)
driver.close()
However, there are other pages, and to access them you need to click on the small arrow (next button) at the bottom. Moreover, you can click that button indefinitely, even when there are no more entries. Therefore, I would like to loop through all pages with entries, stop when there are no more entries, and move on to the next contest. I tried to implement a while loop to do so, but my code did not work.
You really must make sure that the page loads completely before you do anything on that page.
"Moreover, it seems to work only for the first date in my calendar loop and always fails in the next iteration"
Usually when Selenium loads a browser page it tries to look for the element even if the page is not loaded all the way. I suggest you recheck the XPath of the element you are trying to click.
Also, try to see when the page loads completely and use time.sleep(number of seconds) to make sure you hit the element, or check for a particular element or a property of an element that would let you know the page has been loaded, as in the sketch below.
One more suggestion: you can use driver.current_url to see which page you are targeting. I have had this issue while I was working on multiple tabs and I had to tell Python/Selenium to manually switch to that tab.
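As a sketch of those last two suggestions, an explicit wait on the tab before clicking could look like this, continuing from the question's script (driver already created; the //*[@role='tab'] locator is the one the question already tried and is an assumption about the page's markup):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec

print(driver.current_url)  # confirm which page we are actually on

# Wait up to 20 seconds for the contest tab to be clickable, then click it.
contest = WebDriverWait(driver, 20).until(
    ec.element_to_be_clickable((By.XPATH, "//*[@role='tab']")))
contest.click()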

Python and Selenium: I am automating web scraping among pages. How can I loop by Next button?

I have already written several lines of code to pull URLs from this website:
http://www.worldhospitaldirectory.com/United%20States/hospitals
code is below:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
import csv

driver = webdriver.Firefox()
driver.get('http://www.worldhospitaldirectory.com/United%20States/hospitals')
url = []
pagenbr = 1

while pagenbr <= 115:
    current = driver.current_url
    driver.get(current)
    lks = driver.find_elements_by_xpath('//*[@href]')
    for ii in lks:
        link = ii.get_attribute('href')
        if '/info' in link:
            url.append(link)
    print('page ' + str(pagenbr) + ' is done.')
    if pagenbr <= 114:
        elm = driver.find_element_by_link_text('Next')
        driver.implicitly_wait(10)
        elm.click()
    time.sleep(2)
    pagenbr += 1

ls = list(set(url))
with open('US_GeneralHospital.csv', 'wb') as myfile:
    wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
    for u in ls:
        wr.writerow([u])
And it works very well to pull each individual link from this website.
But the problem is that I need to change the number of pages to loop over manually every time.
I want to upgrade this code so that it works out how many iterations it needs by itself, rather than having the page count entered by hand.
Thank you very much.
It is a bad idea to hardcode the number of pages in your script. Try just clicking the "Next" button while it is enabled:
from selenium.common.exceptions import NoSuchElementException

while True:
    try:
        # do whatever you need to do on the page
        driver.find_element_by_xpath('//li[not(@class="disabled")]/span[text()="Next"]').click()
    except NoSuchElementException:
        break
This should allow you to keep scraping pages until the last page is reached.
Also note that the lines current = driver.current_url and driver.get(current) make no sense at all, so you might skip them.
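For completeness, here is a sketch of how the link harvesting from the question could sit inside that loop (same selectors as above; the exact structure is an assumption about how the two snippets would be combined):

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException

driver = webdriver.Firefox()
driver.get('http://www.worldhospitaldirectory.com/United%20States/hospitals')

url = []
while True:
    # collect the /info links on the current page
    for ii in driver.find_elements_by_xpath('//*[@href]'):
        link = ii.get_attribute('href')
        if '/info' in link:
            url.append(link)
    try:
        # move on, or stop once the "Next" button is disabled or gone
        driver.find_element_by_xpath('//li[not(@class="disabled")]/span[text()="Next"]').click()
    except NoSuchElementException:
        break

print(len(set(url)), 'unique links collected')
driver.quit()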

Python Selenium Scrape Hidden Data

I'm trying to scrape the following page (just page 1 for the purpose of this question):
https://www.sportstats.ca/display-results.xhtml?raceid=4886
I can use Selenium to grab the source and then parse it, but not all of the data that I'm looking for is in the source. Some of it needs to be found by clicking on elements.
For example, for the first person I can get all the visible fields from the source. But if you click the +, there is more data I'd like to scrape. For example, the "Chip Time" (01:15:29.9), and also the City (Oakville) that pops up on the right after clicking the + for a person.
I don't know how to identify the element that needs to be clicked to expand the +, then even after clicking it, I don't know how to find the values I'm looking for.
Any tips would be great.
Here is sample code for your requirement. It is based on Python and Selenium with the Chrome driver executable.
from selenium import webdriver
from lxml.html import tostring, fromstring
import time
import csv

myfile = open('demo_detail.csv', 'w', newline='', encoding='utf-8')
wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
driver = webdriver.Chrome('./chromedriver.exe')
csv_heading = ["", "", "BIB", "NAME", "CATEGORY", "RANK", "GENDER PLACE", "CAT. PLACE", "GUN TIME", "SPLIT NAME", "SPLIT DISTANCE", "SPLIT TIME", "PACE", "DISTANCE", "RACE TIME", "OVERALL (/814)", "GENDER (/431)", "CATEGORY (/38)", "TIME OF DAY"]
wr.writerow(csv_heading)
count = 0
try:
    url = "https://www.sportstats.ca/display-results.xhtml?raceid=4886"
    driver.get(url)
    table_tr = driver.find_elements_by_xpath("//table[@class='results overview-result']/tbody/tr[@role='row']")
    for tr in table_tr:
        lst = []
        count = count + 1
        # Collect the visible cells of the row first.
        table_td = tr.find_elements_by_tag_name("td")
        for td in table_td:
            lst.append(td.text)
        # Click the div in the second cell to expand the "+" details row.
        table_td[1].find_element_by_tag_name("div").click()
        time.sleep(5)
        table = driver.find_elements_by_xpath("//div[@class='ui-datatable ui-widget']")
        # Read the extra cells (chip time, city, splits, ...) from the expanded row.
        for demo_tr in driver.find_elements_by_xpath("//tr[@class='ui-expanded-row-content ui-widget-content view-details']/td/div/div/table/tbody/tr"):
            for demo_td in demo_tr.find_elements_by_tag_name("td"):
                lst.append(demo_td.text)
        wr.writerow(lst)
        # Click again to collapse the row before moving on to the next one.
        table_td[1].find_element_by_tag_name("div").click()
        time.sleep(5)
        print(count)
    time.sleep(5)
    driver.quit()
except Exception as e:
    print(e)
    driver.quit()
