Scrape google maps review text data for one company - python

I wanted to scrape review text from Google Maps reviews for one company in order to perform sentiment analysis. However, my code does not run and I get an error. Could you guide me on how to fix this? Thanks!
!pip install selenium
!apt-get update
!apt install chromium-chromedriver
from selenium import webdriver

# Flags needed to run Chrome without a display inside a container/notebook VM.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')

# Start a single browser. The original launched two Chrome processes and only
# ever used one of them. `options=` replaces the removed `chrome_options=`
# keyword; the chromedriver binary is resolved from PATH.
driver = webdriver.Chrome(options=chrome_options)
# `wd` is kept as an alias so code that refers to either name keeps working.
wd = driver
#add your google map link whose data you want to scrape
from selenium import webdriver
from bs4 import BeautifulSoup
import time
import io
import pandas as pd
from selenium import webdriver
from bs4 import BeautifulSoup
import time
import io
import pandas as pd
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
# Open the place page (replace with your own Google Maps link). The coordinate
# part of a Maps URL starts with "@"; a "#" there would turn the rest of the
# URL into a fragment.
driver.get('https://www.google.com/maps/place/Embassy+of+Bangladesh/@38.9418017,-77.0679642,15z/data=!4m7!3m6!1s0x0:0x5621455e7625f36e!8m2!3d38.9418017!4d-77.0679642!9m1!1b1')
wait = WebDriverWait(driver, 10)

# Open the "Sort" drop-down once it is clickable. XPath attribute tests use
# "@attr".
menu_bt = wait.until(
    EC.element_to_be_clickable((By.XPATH, "//button[@data-value='Sort']"))
)
menu_bt.click()

# Wait for the menu entries to render instead of reading them immediately.
sort_options = wait.until(
    EC.presence_of_all_elements_located((By.XPATH, "//div[@role='menuitem']"))
)

# Index of the entry to pick. The original hard-coded 50, which raised
# IndexError because the menu has far fewer entries.
# NOTE(review): 1 is presumably the "Newest" entry - confirm against the live menu.
SORT_OPTION_INDEX = 1
if SORT_OPTION_INDEX >= len(sort_options):
    raise RuntimeError(
        'Sort menu has only %d entries; cannot select index %d'
        % (len(sort_options), SORT_OPTION_INDEX)
    )
recent_rating_bt = sort_options[SORT_OPTION_INDEX]
recent_rating_bt.click()
# Give the review list time to reload in the new order.
time.sleep(5)
Error message:
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
<ipython-input-35-94b4c6e89470> in <module>()
5 menu_bt.click()
6 recent_rating_bt = driver.find_elements_by_xpath(
----> 7 '//div[#role=\'menuitem\']')[50]
8 recent_rating_bt.click()
9 time.sleep(5)
IndexError: list index out of range

You're accessing the item indexed by 50 on the list returned by find_elements_by_xpath(). The error message indicates that this index does not exist, i.e. the returned list is smaller than that.
You should check the length of the returned list before accessing it.

Related

Retrieving specific matches from a list in python

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from time import sleep
from datetime import datetime
import pandas as pd
import warnings
import os
# Work from the project folder (machine-specific absolute path).
os.chdir('C:/Users/paulc/Documents/Medium Football')
# Silence all warnings for the rest of the session.
warnings.filterwarnings('ignore')
# Page whose links are collected below.
base_url = 'https://www.sportingindex.com/spread-betting/football/international-world-cup'
option = Options()
# Run Chrome with a visible window.
option.headless = False
driver = webdriver.Chrome("C:/Users/paulc/Documents/Medium Football/chromedriver.exe",options=option)
driver.get(base_url)
# href of every <a> element on the page, one entry per element.
# NOTE(review): the AttributeError reported below suggests some entries are None - confirm.
links = [elem.get_attribute("href") for elem in driver.find_elements(By.TAG_NAME,"a")]
This code retrieves all the href links on this page. I want to search the links list and return only the matches that start with 'https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a'.
However, I get AttributeError: 'NoneType' object has no attribute 'startswith'
using
import re
# Some entries in `links` can be None, so skip falsy entries before calling
# str.startswith on them.
[
    x for x in links
    if x and x.startswith('https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a')
]
help is appreciated.
Instead of collecting all the a elements on the page, which will include a lot of irrelevant results, you can use a more precise locator.
So, instead of
driver.find_elements(By.TAG_NAME,"a")
Use this:
# XPath selects attributes with "@" ("#href" is not valid XPath).
driver.find_elements(By.XPATH, "//a[contains(@href,'https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a')]")
This will give you desired elements only.
And this
# Only anchors whose href contains the group_a prefix are matched, so every
# element here has an href. XPath selects attributes with "@".
links = [
    elem.get_attribute("href")
    for elem in driver.find_elements(By.XPATH, "//a[contains(@href,'https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a')]")
]
will directly give you the wanted links only.
UPD
In case this is giving you an empty list you possibly are missing a delay. So, you can simply add some pause before that line, like time.sleep(2) but it's better to use WebDriverWait expected_conditions explicit waits for that.
I can't check it since my computer is blocking that link due to my company policy since that is a gambling site, but normally something like this should work:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
# Wait up to 10 seconds for the matching anchors to become visible before
# reading their href. XPath selects attributes with "@".
wait = WebDriverWait(driver, 10)
group_a_anchors = wait.until(
    EC.visibility_of_all_elements_located(
        (By.XPATH, "//a[contains(@href,'https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a')]")
    )
)
links = [elem.get_attribute("href") for elem in group_a_anchors]
The following code is filtering to grab the right links
import time
from bs4 import BeautifulSoup
import pandas as pd
from selenium.webdriver.chrome.service import Service
from selenium import webdriver
webdriver_service = Service("./chromedriver")  # Your chromedriver path
driver = webdriver.Chrome(service=webdriver_service)
driver.get('https://www.sportingindex.com/spread-betting/football/international-world-cup')
driver.maximize_window()
# Fixed pause so the page can finish loading before its source is read.
time.sleep(8)

# Parse the rendered page and print the absolute URL of every matching link.
soup = BeautifulSoup(driver.page_source, "lxml")
for u in soup.select('a[class="gatracking"]'):
    href = u.get('href')
    if not href:
        # An anchor without an href would make the concatenation below raise
        # TypeError.
        continue
    link = 'https://www.sportingindex.com' + href
    # Keep only URLs containing "-v-" (the "team-v-team" fixture slugs).
    if '-v-' in link:
        print(link)
Output:
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.24fdf8f5-b69b-4341-b6b4-d27605f7f7fd/spain-v-germany
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.c9bdf787-791a-47e0-b77c-a2d4cf567bfd/cameroon-v-serbia
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.5eddaa44-666b-47dc-8a0f-4ac758de00dc/south-korea-v-ghana
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.70cefd39-60f7-415e-9cb5-7a56acd403d6/brazil-v-switzerland
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.7fe0285e-366f-4f3c-b77f-4c96077a6c71/portugal-v-uruguay
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.dd7a995d-7478-45f8-af27-9f234d37cc76/ecuador-v-senegal
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.92232207-0f1e-4bb1-bacd-1332ef6b9007/netherlands-v-qatar
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.b913620e-69c7-4606-a153-7b48589b7c94/iran-v-usa
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.7a4a18fb-d4ee-4880-849f-f1afdea33cd5/wales-v-england
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.20c098b4-4e97-4fd1-97b0-f42d84424361/australia-v-denmark
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.5a7476e2-8d35-4a8e-8065-b4339e79f395/tunisia-v-france
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.8a869f02-9dd0-49c5-91bd-209ee224fc2a/poland-v-argentina
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.6379b787-f246-4ba4-a896-28a97396d02f/saudi-arabia-v-mexico
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.52737cfd-da19-42dd-b15b-c16c3e8e9a86/canada-v-morocco
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.168fab1f-8360-4e87-ba84-bfbd11a4a207/croatia-v-belgium
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.9fb541f0-43a4-409c-8e54-e34a43965714/costa-rica-v-germany
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.7379c8a7-ab5d-4653-b487-22bf7ff8eefe/japan-v-spain
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.e7e4c6be-98b7-4258-ba40-74c54a790fe1/ghana-v-uruguay
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.e4c18c81-565e-47ce-b08d-9aed62c88a5d/south-korea-v-portugal
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.18f44028-e23d-48d4-970b-e75c164589bd/cameroon-v-brazil
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.526f9b1b-6d95-4f44-abce-e0a6a30acfd4/serbia-v-switzerland
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.24fdf8f5-b69b-4341-b6b4-d27605f7f7fd/spain-v-germany
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.c9bdf787-791a-47e0-b77c-a2d4cf567bfd/cameroon-v-serbia
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.5eddaa44-666b-47dc-8a0f-4ac758de00dc/south-korea-v-ghana
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.70cefd39-60f7-415e-9cb5-7a56acd403d6/brazil-v-switzerland
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.7fe0285e-366f-4f3c-b77f-4c96077a6c71/portugal-v-uruguay
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.24fdf8f5-b69b-4341-b6b4-d27605f7f7fd/spain-v-germany
https://www.sportingindex.com/spread-betting/rugby-union/france-top-14/group_a.ad22f34f-9cd6-47b4-a826-0c0f0dce7df2/lyon-v-toulouse
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.24fdf8f5-b69b-4341-b6b4-d27605f7f7fd/spain-v-germany
https://www.sportingindex.com/spread-betting/rugby-union/france-top-14/group_a.ad22f34f-9cd6-47b4-a826-0c0f0dce7df2/lyon-v-toulouse
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.c9bdf787-791a-47e0-b77c-a2d4cf567bfd/cameroon-v-serbia
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.5eddaa44-666b-47dc-8a0f-4ac758de00dc/south-korea-v-ghana
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.70cefd39-60f7-415e-9cb5-7a56acd403d6/brazil-v-switzerland
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.7fe0285e-366f-4f3c-b77f-4c96077a6c71/portugal-v-uruguay

How to scrape the ratings and all the reviews from the website using selenium

I want to scrape the rating and all the reviews on the page, but I am not able to find the right XPath.
enter code here
import urllib.request
from bs4 import BeautifulSoup
import csv
import os
from selenium import webdriver
from selenium.webdriver.support.select import Select
from selenium.webdriver.common.keys import Keys
import pandas as pd
import time
chrome_path = r'C:/Users/91940/AppData/Local/Programs/Python/Python39/Scripts/chromedriver.exe'
driver = webdriver.Chrome(executable_path=chrome_path)
# Element lookups below retry for up to 10 seconds before failing.
driver.implicitly_wait(10)

# The URL is one string; adjacent literals are joined so it can span two
# source lines without a line break ending up inside it.
driver.get(
    "https://www.lazada.sg/products/samsung-galaxy-watch3-bt-45mm-titanium-i1156462257-"
    "s4537770883.html?search=1&freeshipping=1"
)

# XPath selects attributes with "@".
product_name = driver.find_element_by_xpath('//*[@id="module_product_title_1"]/div/div/h1')
print(product_name.text)

rating = driver.find_element_by_xpath("//span[@class='score-average']")
# Print the element that was just located (the original printed an undefined
# name, `rate`).
print(rating.text)

review = driver.find_element_by_xpath(
    '//*[@id="module_product_review"]/div/div/div[3]/div[1]/div[1]'
)
print(review.text)
I believe print(product_name.text) is getting execute correct, right ?
There is an issue with driver.find_element_by_xpath("//span[@class='score-average']"): I could not find score-average anywhere in the HTML source.
so try this instead :
# Keep a reference to the located element so its text can be printed.
rating = driver.find_element_by_css_selector("div.pdp-review-summary")
print(rating.text)
You can try the below code to get review :
wait = WebDriverWait(driver, 10)
# The URL must not contain whitespace.
driver.get("https://www.lazada.sg/products/samsung-galaxy-watch3-bt-45mm-titanium-i1156462257-s4537770883.html?search=1&freeshipping=1")

# Click the review-summary link once it is clickable.
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "a[class$='pdp-review-summary__link']"))).click()

# Move the pointer to the "Ratings & Reviews" heading once it is visible.
ratings_heading = wait.until(
    EC.visibility_of_element_located((By.XPATH, "//h2[contains(text(), 'Ratings & Reviews')]"))
)
ActionChains(driver).move_to_element(ratings_heading).perform()

# Wait until at least one review body is visible, then print the inner HTML
# of each one.
wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "div.item-content")))
for review in driver.find_elements(By.CSS_SELECTOR, "div.item-content"):
    print(review.get_attribute('innerHTML'))
Imports :
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
Perhaps there is a problem with your path? (Apologies, I'm not on Windows to test.) From memory, Windows paths use \ characters instead of /. Additionally, you may need two backslashes after the drive letter (C:\\).
c:\\Users\91940\AppData\Local\...

Web Scraping with Selenium on Python using Google Chrome

I am trying to scrape a website to get some company information. If the search result is there and matches the search term, I would like to continue; if not, I would like to move on to the next company.
Here is the code:
import pandas as pd
import numpy as np
from tqdm import notebook
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from time import time, sleep
import datetime
import sys
url = "https://register.fca.org.uk/s/"
# XPath selects attributes with "@".
search_box_path = '//*[@id="search-form-search-section-main-input"]'
firm_checkbox_path = '//*[@id="search-form-search-options-radio-group"]/span[1]/label/span[1]'
# Element holding the name shown on the result page.
result_name_path = '//*[@id="maincontent"]/div[4]/div/div[2]/h1/span[2]'
searchterm = 'XXX Company'

driver = webdriver.Chrome(executable_path=r'C:\Users\XXXX\Chrome Webdriver\chromedriver.exe')
driver.get(url)

# Wait for the search form before interacting with it.
element = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, firm_checkbox_path)))
driver.find_element_by_xpath(firm_checkbox_path).click()
driver.find_element_by_xpath(search_box_path).send_keys(searchterm)
driver.find_element_by_xpath(search_box_path).send_keys(Keys.RETURN)

# The wait returns the located element, so no second lookup is needed. The
# XPath is kept in one string; a line break inside the literal is a
# SyntaxError.
element = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, result_name_path)))
check_result()
The issue is with the check_result function. In this function I am just comparing the searchterm against the element.text of the element from the website.
def check_result():
    """Return True when the name shown on the result page equals ``searchterm``.

    Reads the module-level ``driver`` and ``searchterm``. Surrounding
    whitespace is ignored on both sides so that padding around the rendered
    text does not cause a false mismatch.
    """
    # XPath selects attributes with "@".
    name = driver.find_element_by_xpath('//*[@id="maincontent"]/div[4]/div/div[2]/h1/span[2]')
    return name.text.strip() == searchterm.strip()
This logic works fine on its own, but together with the rest of the code it gives me False, even though I know that the text I provide is equal to the element.text.
Any help is much appreciated.

How to find window/iframe from Chrome DevTools

I'm trying to web scrape using Selenium, Python and Beautiful Soup. I am scraping this page, but I want to scrape information off the pop-up window that appears when you click on the 'i' (information) icons in the corner of each product. My code is as follows:
import requests
from bs4 import BeautifulSoup
import time
import selenium
import math
from selenium.webdriver.support.ui import WebDriverWait
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
import chromedriver_binary
import re
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support import expected_conditions as EC
# Start Chrome using the driver path returned by webdriver_manager.
driver = webdriver.Chrome(ChromeDriverManager().install())
# The same URL is fetched twice: once with requests, once in the browser.
r = requests.get('https://dmarket.com/csgo-skins/product-card/ak-47-redline/field-tested')
driver.get('https://dmarket.com/csgo-skins/product-card/ak-47-redline/field-tested')
# These lookups parse the requests response, not the browser's page source.
# `data` and `dataskin` are not used again in this snippet.
html_getter = BeautifulSoup(r.text, "html.parser")
data = html_getter.findAll(attrs={"class":"c-asset__priceNumber"})
dataskin = html_getter.findAll(attrs={"class" : "c-asset__exterior"})
time.sleep(2)
# Two click steps precede the info-icon click.
# NOTE(review): judging by the ids/class names these dismiss a notification prompt and a dialog - confirm.
driver.find_element_by_id("onesignal-slidedown-cancel-button").click()
time.sleep(2)
driver.find_element_by_class_name("c-dialogHeader__close").click()
time.sleep(30)
# find_element_* returns the FIRST match, so only one info icon is clicked.
driver.find_element_by_class_name("c-asset__action--info").click()
time.sleep(30)
# switch_to.active_element is a single element, and only its text is printed.
# NOTE(review): presumably the pop-up's close button has focus here, which would explain the "close" output - confirm.
price_element = driver.switch_to.active_element
print("<<<<<TEXT>>>>>")
print(price_element.text)
print("<<<<<END>>>>>")
driver.close()
However, when I run this, the only text that prints is "close". If you inspect the information pop-up, it should print out the price, the data from the chart, etc. How can I get it to print this info? Specifically, I want the amount sold on the most recent day and the price listed on the chart for the most recent day (both seem to be accessible in Chrome DevTools). I don't think I'm looking at the wrong frame, as I switch to the active element, so I'm not sure how to fix this!

Webscraping using Selenium in Python

I am trying to scrape data from the Sunshine List website (http://www.sunshinelist.ca/) using the BeautifulSoup library and the Selenium package (in order to deal with the 'Next' button on the webpage). I know there are several related posts but I just can't identify where and how I should explicitly ask the driver to wait.
Error: StaleElementReferenceException: Message: The element reference
is stale: either the element is no longer attached to
the DOM or the page has been refreshed
This is the code I have written:
import numpy as np
import pandas as pd
import requests
import re
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
ffx_bin = FirefoxBinary(r'C:\Users\BhagatM\AppData\Local\Mozilla Firefox\firefox.exe')
ffx_caps = DesiredCapabilities.FIREFOX
ffx_caps['marionette'] = True
driver = webdriver.Firefox(capabilities=ffx_caps, firefox_binary=ffx_bin)
driver.get("http://www.sunshinelist.ca/")
driver.maximize_window()

# Text of every table row seen so far, across all pages.
tablewotags1 = []
while True:
    # Buffer the current page separately so that a read interrupted by a
    # stale element never leaves half a page in the result list.
    page_rows = []
    try:
        divs = driver.find_element_by_id('datatable-disclosures')
        divs1 = divs.find_elements_by_tag_name('tbody')
        for d1 in divs1:
            div2 = d1.find_elements_by_tag_name('tr')
            for d2 in div2:
                page_rows.append(d2.text)
    except StaleElementReferenceException:
        # The row references went stale while being read, so pause briefly,
        # then locate the table again and re-read the page.
        # NOTE(review): presumably the table is redrawn after "Next" - confirm.
        time.sleep(0.5)
        continue
    tablewotags1.extend(page_rows)

    # Advance to the next page; stop when there is no "Next" link.
    try:
        driver.find_element_by_link_text('Next →').click()
    except NoSuchElementException:
        break

# Split the collected entries into columns by position within each group of 10.
year1 = tablewotags1[0::10]
name1 = tablewotags1[3::10]
position1 = tablewotags1[4::10]
employer1 = tablewotags1[1::10]
df1 = pd.DataFrame({'Year': year1, 'Name': name1, 'Position': position1, 'Employer': employer1})
df1.to_csv('Sunshine List-1.csv', index=False)
I think you just need to point to the correct firefox Binary. Also, Which version of Firefox are you using? Looks like it's one of the newer versions, this should do if thats the case.
# Replace the placeholder with the full path to your firefox executable.
ffx_bin = FirefoxBinary(r'pathtoyourfirefox')
# Note: this is the shared FIREFOX capabilities dict, modified in place.
ffx_caps = DesiredCapabilities.FIREFOX
ffx_caps['marionette'] = True
driver = webdriver.Firefox(capabilities=ffx_caps,firefox_binary=ffx_bin)
Cheers
EDIT: To answer your new enquiry, "why is it not writing the CSV", you should do it like this:
import csv  # You are missing this import

ls_general_list = []


def csv_for_me(list_to_csv, path=None):
    """Append the rows held in *list_to_csv* to a CSV file.

    Args:
        list_to_csv: Iterable of groups. Each group is an iterable of rows,
            and each row is a sequence of fields written as one CSV line.
        path: Destination file. When omitted, the module-level ``pathtocsv``
            is used, as before.

    The file is opened in append mode, so repeated calls add to the file
    instead of overwriting it.
    """
    if path is None:
        path = pathtocsv
    with open(path, 'a', newline='') as csvfile:
        # The keyword is spelled "delimiter"; csv.writer rejects "delimeter"
        # with a TypeError.
        sw = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
        for line in list_to_csv:
            for data in line:
                sw.writerow(data)
Then replace this line in your code: df=pd.DataFrame({'Year':year,'Name':name,'Position':position,'Employer':employer})
# with this one (a tuple of the four column lists; `key: value` pairs are not
# valid inside a tuple):
ls_general_list.append((year, name, position, employer))
then do so like this,
csv_for_me(ls_general_list)
Please accept the answer if it's satisfactory and now you have a csv

Categories