How to move to the next enclosing (div) while scraping a site? - python

All the data is populated from the first table. I cannot move to the next div and get the data of the td for each tr.
The site: https://asd.com/page/
Below is the code that I have written.
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))
my_url= "https://asd.com/page/asd"
driver.get(my_url)
boxes = driver.find_elements(By.CLASS_NAME, "col-md-4")
companies = []
company = {}
for box in boxes:
    header = box.find_element(By.CLASS_NAME, "text-primary.text-uppercase")
    company['name'] = header.text
    td = box
    company['Type'] = td.find_element(By.XPATH, "//div/div/div/table/tbody/tr[1]/td").text
    company['Capital'] = td.find_element(By.XPATH, "//div/div/div/table/tbody/tr[2]/td").text
    company['Address'] = td.find_element(By.XPATH, "//div/div/div/table/tbody/tr[3]/td").text
    company['Owner'] = td.find_element(By.XPATH, "//div/div/div/table/tbody/tr[4]/td").text
    company['Co-Owner'] = td.find_element(By.XPATH, "//div/div/div/table/tbody/tr[5]/td").text
    company['Duration'] = td.find_element(By.XPATH, "//div/div/div/table/tbody/tr[6]/td").text
    company['Place'] = td.find_element(By.XPATH, "//div/div/div/table/tbody/tr[7]/td").text
    company['Company ID'] = td.find_element(By.XPATH, "//div/div/div/table/tbody/tr[8]/td").text
    companies.append(company)
    print(company)

There are several issues here:
You need to add some delay between driver.get(my_url) and boxes = driver.find_elements(By.CLASS_NAME, "col-md-4") to let the elements load before getting the list of all of them.
text-primary.text-uppercase is actually 2 class names, text-primary and text-uppercase, so you should use XPATH or CSS_SELECTOR to locate an element by 2 class names, not CLASS_NAME.
In order to locate elements inside another element you should use an XPath starting with a dot . (see the short example after this list).
Your locators like //div/div/div/table/tbody/tr[1]/td are absolute, while they should be calculated relative to the parent box element.
There is no need to define a td element; you can use the existing box element here.
Locators like //div/div/div/table/tbody/tr[1]/td can and should be improved.
You will probably need to scroll to the boxes while iterating over them.
I think company = {} should be defined inside the loop.
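To illustrate the relative-locator points, a minimal sketch assuming a box element has already been located: an XPath starting with // searches the whole document no matter which element it is called on, while one starting with .// searches only inside that element.
# Absolute XPath: always matches the FIRST such td in the whole page,
# which is why every box yielded the first table's data
td_wrong = box.find_element(By.XPATH, "//div/div/div/table/tbody/tr[1]/td")
# Relative XPath: matches only within this particular box
td_right = box.find_element(By.XPATH, ".//tr[1]/td")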
This should work better:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
import time
from webdriver_manager.chrome import ChromeDriverManager
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))
my_url= "https://monentreprise.bj/page/annonces"
driver.get(my_url)
wait = WebDriverWait(driver, 20)
actions = ActionChains(driver)
wait.until(EC.presence_of_element_located((By.CLASS_NAME, "col-md-4")))
time.sleep(2)
boxes = driver.find_elements(By.CLASS_NAME, "col-md-4")
companies = []
for box in boxes:
    actions.move_to_element(box).perform()
    time.sleep(0.3)
    company = {}
    header = box.find_element(By.XPATH, ".//h5[@class='text-primary text-uppercase']")
    company['name'] = header.text
    company['Objet'] = box.find_element(By.XPATH, ".//tr[1]/td").text
    company['Capital'] = box.find_element(By.XPATH, ".//tr[2]/td").text
    company['Siège Social'] = box.find_element(By.XPATH, ".//tr[3]/td").text
    company['Gérant'] = box.find_element(By.XPATH, ".//tr[4]/td").text
    company['Co-Gérant'] = box.find_element(By.XPATH, ".//tr[5]/td").text
    company['Durée'] = box.find_element(By.XPATH, ".//tr[6]/td").text
    company['Dépôt'] = box.find_element(By.XPATH, ".//tr[7]/td").text
    company['Immatriculation RCCM'] = box.find_element(By.XPATH, ".//tr[8]/td").text
    companies.append(company)
    print(company)
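As a follow-up, if you want to persist the results, the companies list of dicts converts directly to a DataFrame; a minimal sketch (the companies.csv filename is just an example):
import pandas as pd

df = pd.DataFrame(companies)  # one row per company box
df.to_csv('companies.csv', index=False, encoding='utf-8')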

Related

How to get Instagram Number of Post, Number of Followers, and Number of Following using Selenium?

Firstly, I'm sorry for my poor English. I'm kind of new to Python. I would like to know how to scrape the Instagram number of posts, number of followers, and number of following for certain accounts (I try to loop over them) and store the data in CSV files.
I've been trying to figure out the XPath, and I thought my XPath was already correct, so what did I miss?
Here is my code:
import selenium
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome import service
from selenium.webdriver.common.keys import Keys
import time
import wget
import os
import pandas as pd
import matplotlib.pyplot as plt
from selenium.webdriver.chrome.service import Service
urls = [
    'https://www.instagram.com/acc_1/',
    'https://www.instagram.com/acc_2/',
    'https://www.instagram.com/acc_3/',
    'https://www.instagram.com/acc_4/',
    'https://www.instagram.com/acc_5/',
    'https://www.instagram.com/acc_6/',
    'https://www.instagram.com/acc_7/',
    'https://www.instagram.com/acc_8/',
    'https://www.instagram.com/acc_9/',
    'https://www.instagram.com/acc_10/',
    'https://www.instagram.com/acc_11/',
    'https://www.instagram.com/acc_12/',
    'https://www.instagram.com/acc_13/',
    'https://www.instagram.com/acc_14/'
]
username_channel = []
number_of_post_chan = []
followers_chan = []
followings_chan = []
description_chan = []
# open directly
# collecting data
for url in urls:
    PATH = 'C:\\webdrivers\\chromedriver.exe'
    driver = webdriver.Chrome(PATH)
    driver.get(url)
    # driver.maximize_window()
    driver.implicitly_wait(10)
    # log in
    login = driver.find_element(By.XPATH, "//input[@name='username']")
    login.clear()
    login.send_keys('xxxxx')
    driver.implicitly_wait(5)
    login_pass = driver.find_element(By.XPATH, "//input[@name='password']")
    login_pass.clear()
    login_pass.send_keys('xxxxx')
    driver.implicitly_wait(5)
    button_login = driver.find_element(By.XPATH, "//form[@id='loginForm']/div/div[3]/button/div")
    button_login.click()
    time.sleep(3)
    # Save Your Login info?
    login_info = driver.find_element(By.XPATH, "//div[@class='cmbtv']/button")
    login_info.click()
    time.sleep(10)
    driver.implicitly_wait(5)
    usernameChan = driver.find_element(By.XPATH, "//h2[@class='_aacl _aacs _aact _aacx _aada']").text
    numb_of_post = driver.find_element(By.CSS_SELECTOR, "//ul[@class=' _aa_8']/li[1]/div/span").text
    followers = driver.find_element(By.XPATH, "//ul[@class=' _aa_8']/li[2]/a/div/span").get_attribute('title')
    followings = driver.find_element(By.XPATH, "//ul[@class=' _aa_8']/li[3]/a/div/span").text
    description = driver.find_element(By.XPATH, "//div[@class='_aa_c']/div").text
    # username_channel.append(usernameChan)
    # number_of_post_chan.append(numb_of_post)
    # followers_chan.append(followers)
    # followings_chan.append(followings)
    # description_chan.append(description)
    print(username_channel, number_of_post_chan, followers_chan, followings_chan, description_chan)
    account_items = {
        "username_ig": username_channel,
        "jumlah_posting": number_of_post_chan,
        "followers": followers_chan,
        "followings": followings_chan,
        "deskripsi": description_chan
    }
    driver.quit()
df = pd.DataFrame(account_items, columns=["username_ig", "jumlah_posting", "followers", "followings", "deskripsi"])
print(df)
Is there any better way to express the element? Help.
Thank you in advance.
To get the username, number of posts, followers, following, and description you can select the elements using CSS_SELECTOR.
In your code, after the third driver.implicitly_wait(5) statement, replace the next 5 lines with the following.
usernameChan = driver.find_element(By.CSS_SELECTOR,"h2._aacl._aacs._aact._aacx._aada").text
details = driver.find_elements(By.CSS_SELECTOR, "span._ac2a._ac2b")
numb_of_post = details[0].text
followers = details[1].text
followings = details[2].text
description = driver.find_element(By.CSS_SELECTOR, "div._aacl._aaco._aacu._aacx._aad6._aade").text
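Since your goal is to store the data in CSV files, a minimal follow-up sketch (essentially uncommenting your own append lines and building the DataFrame once, after the loop; the CSV filename is just an example):
    # inside the for url in urls: loop, after fetching the values
    username_channel.append(usernameChan)
    number_of_post_chan.append(numb_of_post)
    followers_chan.append(followers)
    followings_chan.append(followings)
    description_chan.append(description)

# after the loop
df = pd.DataFrame({
    "username_ig": username_channel,
    "jumlah_posting": number_of_post_chan,
    "followers": followers_chan,
    "followings": followings_chan,
    "deskripsi": description_chan
})
df.to_csv('instagram_accounts.csv', index=False, encoding='utf-8')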
EDIT: As you said, you got the error IndexError: list index out of range while fetching the details above. This is probably because the elements had not loaded yet. With the imports below, replace the line where we fetch details with the one in the code below.
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
details = WebDriverWait(driver, 10).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, "span._ac2a._ac2b")))
The problem there is that the selector depends on whether the window is expanded or not.
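If the layout really does change with the window width, one defensive option is to try one selector and fall back to the other; a minimal sketch, where both selector strings are placeholders for the expanded- and narrow-window variants:
from selenium.common.exceptions import NoSuchElementException

def find_with_fallback(driver, expanded_css, narrow_css):
    # Try the expanded-window selector first, then the narrow-window one
    try:
        return driver.find_element(By.CSS_SELECTOR, expanded_css)
    except NoSuchElementException:
        return driver.find_element(By.CSS_SELECTOR, narrow_css)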

How can we loop through items that are coordinates, load each into a Google Maps page, and grab an element from each page?

I'm testing the code below, which should loop through items in a column of a dataframe, load each into Google Maps, and grab a 'title' from each page. I want to push the result into a column in a dataframe (df_fin['my_place'] = driver.title).
The problem that I am noticing is that the first record works perfectly fine, but no subsequent items work. I'm guessing something is wrong with the page refresh, or maybe the click event, but I don't know for sure what's going on here.
import pandas as pd
df_fin = pd.read_csv('C:\\Users\\df_fin.csv')
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium import webdriver
import time
driver = webdriver.Chrome('C:\\Users\\TryMe\\chromedriver.exe')
driver.maximize_window()
driver.implicitly_wait(5)
i = 1
for item in df_fin['place']:
    try:
        driver.get(item)
        wait = WebDriverWait(driver, 10)
        main_canvas = wait.until(EC.element_to_be_clickable((By.XPATH, "//*[name()='canvas']")))
        size = main_canvas.size
        w, h = size['width'], size['height']
        new_w = w / 2
        new_h = h / 2
        ActionChains(driver).move_by_offset(new_h, new_h).pause(5).perform()
        time.sleep(2)
        wait.until(EC.element_to_be_clickable((By.XPATH, "//*[name()='canvas']"))).click()
        print(driver.title)
        df_fin['my_place'] = driver.title
    except:
        df_fin['my_place'] = 'OTHER'
    i = i + 1
    print(i)
df_fin.head(10)
Here is a sample of what's in my dataframe.
https://www.google.com/maps/@42.33988,-71.10409,18z
https://www.google.com/maps/@39.73914,-75.54937,18z
https://www.google.com/maps/@44.4995,-88.05496,18z
https://www.google.com/maps/@44.50235,-88.06322,18z
https://www.google.com/maps/@40.82265,-73.40959,18z
Finally, in the screenshot (not reproduced here), you can see that 'title' is 'Rod Laver Arena'. That's what I'm trying to get.
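One issue visible in the code itself, independent of the page refresh: df_fin['my_place'] = driver.title assigns the same title to every row of the column on each iteration, so only the last page's title survives. A minimal sketch of per-row assignment (assuming df_fin keeps the default integer index it gets from read_csv):
for idx, item in enumerate(df_fin['place']):
    try:
        driver.get(item)
        wait = WebDriverWait(driver, 10)
        wait.until(EC.element_to_be_clickable((By.XPATH, "//*[name()='canvas']"))).click()
        df_fin.loc[idx, 'my_place'] = driver.title  # write to this row only
    except Exception:
        df_fin.loc[idx, 'my_place'] = 'OTHER'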

Python Selenium Path not found

I have written the following code:
import os
from webdriver_manager.chrome import ChromeDriverManager
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
options = Options()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--start-maximized')
options.page_load_strategy = 'eager'
driver = webdriver.Chrome(options=options)
url = "https://www.moneycontrol.com/india/stockpricequote/chemicals/tatachemicals/TC"
driver.get(url)
try:
    wait = WebDriverWait(driver, 10)
except Exception:
    driver.send_keys(Keys.CONTROL + 'Escape')
driver.find_element_by_link_text("Bonus").click()
try:
    wait = WebDriverWait(driver, 5)
except Exception:
    driver.send_keys(Keys.CONTROL + 'Escape')
for i in range(0, 50):
    bonus_month = driver.find_element_by_xpath("//*[@class='mctable1.thborder.frtab']/tbody/tr[%s]/td[1]" % (i))
    print(bonus_month.text)
    bonus = driver.find_element_by_xpath("//*[@class='mctable1.thborder.frtab']/tbody/tr[%s]/td[1]" % (i))
    print(bonus.text)
This gives me the error:
no such element: Unable to locate element: {"method":"xpath","selector":"//*[@class='mctable1.thborder.frtab']/tbody/tr[0]/td[1]"}
Element on the page (screenshot omitted).
Where am I making a mistake in finding Ex-Bonus and Ratio?
First, use the element_to_be_clickable expected condition to check that the element becomes clickable within the given time, just to make sure it is operational.
Once the click action is performed on the Bonus link, the table takes some time to finish loading. In the meantime Selenium tries to fetch the table content and fails to get it. So add another wait for the element to load, then grab the table using XPath and iterate over the rows of the table:
WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, "//ul[@id='corporation_tab']//a[@href='#ca_bonus']")))
driver.find_element_by_link_text("Bonus").click()
WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, "//tbody[@id='id_b']/tr")))
tableRows = driver.find_elements_by_xpath('//tbody[@id="id_b"]/tr')
print(tableRows)
for i in tableRows:
    AnnouncementDate = i.find_element_by_xpath('td[1]').text
    exbonus = i.find_element_by_xpath('td[2]').text
    ratio = i.find_element_by_xpath('td[3]').text
    print(AnnouncementDate + " \t\t " + exbonus + " \t\t " + ratio)
This returns the output (screenshot omitted).
You will need the following extra import:
from selenium.webdriver.support import expected_conditions as EC
This will partially solve your issue with locators:
1. To find Ex-Bonus, use the CSS selector #id_b>tr>td:nth-of-type(2).
2. To find the ratio, also use a CSS selector: #id_b>tr>td:nth-of-type(3).
To iterate, use:
#id_b>tr:nth-of-type(x)>td:nth-of-type(3)
where x is the number of the row.
For example, #id_b>tr:nth-of-type(1)>td:nth-of-type(3) will give you the text with ratio 3:5.
If you avoid using #id_b, this locator will not be unique.
Instead of the range function I'd use find_elements_by_css_selector. Try the following:
rows = driver.find_elements_by_css_selector("#id_b>tr")
for row in rows:
    bonus = row.find_element_by_css_selector("td:nth-of-type(2)").text
    ratio = row.find_element_by_css_selector("td:nth-of-type(3)").text
There are only 5 of these elements on the page; I'll have to click See More.
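If only the first five rows are rendered until See More is clicked, one option is to click that control first and then collect the rows; a minimal sketch (the 'See More' link-text locator is an assumption about the page):
# Hypothetical locator: adjust to however the See More control is exposed
see_more = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable((By.LINK_TEXT, "See More")))
see_more.click()
rows = driver.find_elements_by_css_selector("#id_b>tr")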

Extracting reviews from Google play store app website

import time
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import pandas as pd
class FindByXpathCss():
    # Declaring variables
    Reviews = []  # List to store final set of reviews
    reviewText = []  # List to store reviews extracted from XPath
    reviewFullText = []
    # Chromedriver path
    driver = webdriver.Chrome(executable_path=r"F:\Chrome-webdriver\chromedriver.exe")
    driver.maximize_window()
    baseUrl = "https://play.google.com/store/apps/details?id=com.delta.mobile.android&hl=en_US&showAllReviews=true"
    driver.get(baseUrl)
    # driver.execute_script("scrollBy(0,300);")
    # Scrolling down
    for i in range(20):
        driver.find_element_by_xpath('//*[@id="yDmH0d"]').send_keys(Keys.ARROW_DOWN, i)
        time.sleep(0.5)
    # To click on Show more button
    # btnShowMore = driver.find_element_by_xpath('//*[@id="fcxH9b"]/div[4]/c-wiz/div/div[2]''/div/div[1]/div/div/div[1]/div[2]/div[2]/div/span/span').click()
    # Scrolling to top
    for j in range(10):
        driver.find_element_by_xpath('//*[@id="yDmH0d"]').send_keys(Keys.ARROW_UP, j)
    # for i in range(10):
    review_btn = driver.find_elements_by_xpath("//button[contains(@class,'')][contains(text(),'Full Review')]")
    single_review_btn = driver.find_element_by_xpath("//button[contains(@class,'')][contains(text(),'Full Review')]")
    # time.sleep(1)
The div HTML tag contains 2 spans: one has jsname 'fbQN7e', which holds the bigger reviews that end with a "Full Review" button; the other span within the same div is 'bN97Pc', which holds the smaller reviews that don't have a "Full Review" button at the end. I couldn't get the reviews from both types of span. Here I tried to write the reviewFullText list directly to a dataframe, but I get only the element datatype, not the text. I don't know why this is happening either.
    for btn in review_btn:
        btn.click()
    reviewFullText = driver.find_elements_by_css_selector("span[jsname='fbQN7e']")
    # if single_review_btn.is_enabled() == False:
    #     reviewText = driver.find_elements_by_css_selector("span[jsname=\"bN97Pc\"]")
    # else:
    #     pass
    # Iterating each review and appending into list Reviews
    for txtreview in reviewText:
        reviewFullText.append(txtreview.text)
    print(len(reviewFullText))
    # Writing the list values into csv file
    df = pd.DataFrame(reviewFullText)
    # df = pd.DataFrame({'Reviews': 'Reviews'})  # 'Sentiment': 'null'})
    df.to_csv('Reviews.csv', index=True, encoding='utf-8')
    driver.close()
I have modified your solution to retrieve all reviews from the page.
import time
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
class FindByXpathCss():
    driver = webdriver.Chrome(executable_path=r"C:\New folder\chromedriver.exe")
    driver.maximize_window()
    baseUrl = "https://play.google.com/store/apps/details?id=com.delta.mobile.android&hl=en_US&showAllReviews=true"
    driver.get(baseUrl)
    scrolls = 3
    while True:
        scrolls -= 1
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
        time.sleep(3)
        if scrolls < 0:
            break
    buttonClick = WebDriverWait(driver, 30).until(
        EC.visibility_of_all_elements_located((By.XPATH, "//button[contains(@class,'')][contains(text(),'Full Review')]")))
    for element in buttonClick:
        driver.execute_script("arguments[0].click();", element)
    reviewText = WebDriverWait(driver, 30).until(
        EC.presence_of_all_elements_located((By.XPATH, "//*[@class='UD7Dzf']")))
    # reviewText = driver.find_elements_by_xpath("//*[@class='UD7Dzf']")
    for textreview in reviewText:
        print(textreview.text)
Output: the review texts, printed one per line (screenshot omitted).
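To also cover the original goal of writing the reviews to CSV (and the "element datatype, not text" problem from the question), extract .text from each WebElement before building the DataFrame; a minimal sketch reusing the reviewText list from above:
import pandas as pd

texts = [el.text for el in reviewText]  # plain strings, not WebElements
pd.DataFrame({'Reviews': texts}).to_csv('Reviews.csv', index=False, encoding='utf-8')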

How to get product price for different sizes - selenium python

I want to scrape product information from this page. This product has three different sizes, and the price changes if I select a different size from the drop-down section. Right now my scraper can only scrape the default price after the initial page load, which is 35 for 1kg. How will I scrape the price for 500g and 250g? Here is my code:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
# argument for incognito Chrome
option = Options()
option.add_argument("--incognito")
browser = webdriver.Chrome(options=option)
browser.get("https://boutique.cafebarista.ca/products/cremone?variant=18033418797121")
product_title = browser.find_element_by_xpath('//h1[@class="product-name"]')
long_description = browser.find_element_by_xpath('//div[@class="product-landing-container"]')
price = browser.find_element_by_xpath('//div[@class="product-btn-price ProductPrice"]')
print(product_title.text,long_description.text,price.text)
browser.quit()
With .find_elements_by_css_selector you can get each text without clicking the weight drop-down first; this is the selector I mean:
nav[id="w-dropdown-list-16"] > a > div
And you can also click on each of these elements using .execute_script.
Try the following code:
driver.get('https://boutique.cafebarista.ca/products/autentico?variant=18033459331137')
weight_list = driver.find_elements_by_css_selector('nav[id="w-dropdown-list-16"] > a > div')
for weight in weight_list:
    driver.execute_script('arguments[0].click();', weight)
    price = driver.find_element_by_id('ProductPrice').text
    print(weight.get_attribute('innerHTML') + ' ' + price)
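One caveat, as an assumption about how the page behaves: if the price is updated asynchronously after the click, reading it immediately may still return the old value. A small sketch of waiting until the price text changes, using WebDriverWait with a custom condition (it assumes consecutive weights have different prices):
from selenium.webdriver.support.ui import WebDriverWait

old_price = driver.find_element_by_id('ProductPrice').text
driver.execute_script('arguments[0].click();', weight)
# Block until the displayed price differs from the pre-click value
WebDriverWait(driver, 10).until(
    lambda d: d.find_element_by_id('ProductPrice').text != old_price)
price = driver.find_element_by_id('ProductPrice').text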
Try the solution below:
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium import webdriver
browser = webdriver.Chrome(executable_path=r"C:\New folder\chromedriver.exe")
browser.maximize_window()
wait = WebDriverWait(browser, 20)
browser.get("https://boutique.cafebarista.ca/products/cremone?variant=18033418797121")
product_title = browser.find_element_by_xpath('//h1[@class="product-name"]')  # as in the question's code
kg_button = browser.find_element_by_xpath("//div[@id='w-dropdown-toggle-16']")
kg_button.click()
weight_options = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//nav[@id='w-dropdown-list-16']//a")))
kg_button.click()
for element in weight_options:
    kg_button.click()
    actionChains = ActionChains(browser)
    actionChains.move_to_element(element).click().perform()
    price = browser.find_element_by_xpath("//div[@id='ProductPrice']")
    print(product_title.text)
    print(element.text)
    print(price.text)
browser.quit()
