Web scraping Amazon and creating a list of dicts - Python

I am experiencing some trouble with my code when scraping the Amazon site with Selenium.
I want a list of dictionaries, with the title and author of each book as keys and values, in the format:
[{TITLE:'x', AUTHOR:'y'},
 {TITLE:'z', AUTHOR:'w'}]
However, it returns a dictionary of lists, with keys and values repeated, in the format:
{TITLE:['x'], AUTHOR:['y']}
{TITLE:['x', 'z'], AUTHOR:['y', 'r']}
{TITLE:['x', 'z', 'q'], AUTHOR:['y', 'r', 'p']}
That is: on each iteration it repeats the values for each key, carrying the previous values over into the next dictionary. That is not supposed to happen. What am I doing wrong?
Here is my code:
Firstly, I import the libraries:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from time import sleep
Secondly, I install the proper version of chromedriver:
Service(ChromeDriverManager().install())
Thirdly, I open the browser automatically:
options = Options()
options.add_argument('window-size=250,1000')
driver = webdriver.Chrome(executable_path=r'C:\Users\dambr\Documents\scrapping\chromedriver.exe', options=options)
driver.implicitly_wait(5)
Fourthly, I open the Amazon site:
driver.get('https://www.amazon.com.br/')
a = driver.find_element(By.ID, "twotabsearchtextbox")
a.click()
a.send_keys('python')
b = driver.find_element(By.ID, "nav-search-submit-button")
b.click()
sleep(3)
Finally, I take all the titles and authors of my search and try to store in a list of dictionaries:
dic_livros = {'TÍTULO':[], 'AUTOR':[]}
lista = '//*[@id="search"]/div[1]/div[1]/div/span[1]'
for i in lista:
    title = driver.find_elements(By.XPATH, "//span[@class='a-size-base-plus a-color-base a-text-normal']")
    author = driver.find_elements(By.XPATH, "//span[@class='a-size-base']")
    for (each_title, each_author) in zip(title, author):
        dic_livros['TÍTULO'].append(each_title.text)
        dic_livros['AUTOR'].append(each_author.text)
        print(dic_livros)
Where, precisely, is my mistake?

Your last step needs two changes:
Replace the first line with
dic_livros = []
then, for the for loop:
for (each_title, each_author) in zip(title, author):
    dic_livros.append({'Titulo':each_title.text,'Autor':each_author.text})
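Putting it together, the corrected final step might look roughly like this (a minimal sketch, assuming driver is already on the search-results page from the earlier steps):
# Sketch of the corrected last step; assumes `driver` shows the results page.
dic_livros = []
title = driver.find_elements(By.XPATH, "//span[@class='a-size-base-plus a-color-base a-text-normal']")
author = driver.find_elements(By.XPATH, "//span[@class='a-size-base']")
for (each_title, each_author) in zip(title, author):
    # One dictionary per book, appended to the list
    dic_livros.append({'Titulo': each_title.text, 'Autor': each_author.text})
print(dic_livros)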

Related

How to get Instagram Number of Posts, Number of Followers, and Number of Following using Selenium?

Firstly, I'm sorry for my poor English. I'm kinda new to Python. I would like to know how to scrape the Instagram number of posts, number of followers, and number of following for certain accounts (I try to loop over them) and store the data in CSV files.
I've been trying to figure out the XPATH, and I thought my XPATH was already correct, so what did I miss?
Here is my code:
import selenium
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome import service
from selenium.webdriver.common.keys import Keys
import time
import wget
import os
import pandas as pd
import matplotlib.pyplot as plt
from selenium.webdriver.chrome.service import Service
urls = [
'https://www.instagram.com/acc_1/',
'https://www.instagram.com/acc_2/',
'https://www.instagram.com/acc_3/',
'https://www.instagram.com/acc_4/',
'https://www.instagram.com/acc_5/',
'https://www.instagram.com/acc_6/',
'https://www.instagram.com/acc_7/',
'https://www.instagram.com/acc_8/',
'https://www.instagram.com/acc_9/',
'https://www.instagram.com/acc_10/',
'https://www.instagram.com/acc_11/',
'https://www.instagram.com/acc_12/',
'https://www.instagram.com/acc_13/',
'https://www.instagram.com/acc_14/'
]
username_channel = []
number_of_post_chan = []
followers_chan = []
followings_chan = []
description_chan = []
#open the page directly
#collecting_data
for url in urls:
    PATH = 'C:\webdrivers\chromedriver.exe'
    driver = webdriver.Chrome(PATH)
    driver.get(url)
    #driver.maximize_window()
    driver.implicitly_wait(10)
    #log-in
    login = driver.find_element(By.XPATH, "//input[@name='username']")
    login.clear()
    login.send_keys('xxxxx')
    driver.implicitly_wait(5)
    login_pass = driver.find_element(By.XPATH, "//input[@name='password']")
    login_pass.clear()
    login_pass.send_keys('xxxxx')
    driver.implicitly_wait(5)
    button_login = driver.find_element(By.XPATH, "//form[@id='loginForm']/div/div[3]/button/div")
    button_login.click()
    time.sleep(3)
    #Save Your Login info?
    login_info = driver.find_element(By.XPATH, "//div[@class='cmbtv']/button")
    login_info.click()
    time.sleep(10)
    driver.implicitly_wait(5)
    usernameChan = driver.find_element(By.XPATH, "//h2[@class='_aacl _aacs _aact _aacx _aada']").text
    numb_of_post = driver.find_element(By.CSS_SELECTOR, "//ul[@class=' _aa_8']/li[1]/div/span").text
    followers = driver.find_element(By.XPATH, "//ul[@class=' _aa_8']/li[2]/a/div/span").get_attribute('title')
    followings = driver.find_element(By.XPATH, "//ul[@class=' _aa_8']/li[3]/a/div/span").text
    description = driver.find_element(By.XPATH, "//div[@class='_aa_c']/div").text
    #username_channel.append(usernameChan)
    #number_of_post_chan.append(numb_of_post)
    #followers_chan.append(followers)
    #followings_chan.append(followings)
    #description_chan.append(description)
    print(username_channel, number_of_post_chan, followers_chan, followings_chan, description_chan)
    account_items = {
        "username_ig" : username_channel,
        "jumlah_posting" : number_of_post_chan,
        "followers" : followers_chan,
        "followings" : followings_chan,
        "deskripsi" : description_chan
    }
    driver.quit()
df = pd.DataFrame(account_items, columns=["username_ig", "jumlah_posting", "followers", "followings", "deskripsi"])
print(df)
Is there any better way to express the element? Heeelp.
Thank you in advance.
To get the username, number of posts, followers, followings, and description, you can select the elements using CSS_SELECTOR.
In your code, after the third driver.implicitly_wait(5) statement, replace the next 5 lines with the following.
usernameChan = driver.find_element(By.CSS_SELECTOR,"h2._aacl._aacs._aact._aacx._aada").text
details = driver.find_elements(By.CSS_SELECTOR, "span._ac2a._ac2b")
numb_of_post = details[0].text
followers = details[1].text
followings = details[2].text
description = driver.find_element(By.CSS_SELECTOR, "div._aacl._aaco._aacu._aacx._aad6._aade").text
EDIT: As you said, you got an error while fetching the details above: IndexError: list index out of range. This is probably because the elements might not have loaded yet. With the imports below, replace the line where we fetch details with the code below.
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
details = WebDriverWait(driver, 10).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, "span._ac2a._ac2b")))
The problem there is that the selector depends on whether the window is expanded or not.
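Putting the pieces together, here is a minimal sketch (an illustration, not the original code) of how the wait-based fetch could sit inside the for url in urls: loop and feed the CSV output the question asks for; the output filename is hypothetical:
import pandas as pd
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

rows = []  # one dict per account
for url in urls:
    # ... create the driver, log in and call driver.get(url) as in the question ...
    wait = WebDriverWait(driver, 10)
    usernameChan = driver.find_element(By.CSS_SELECTOR, "h2._aacl._aacs._aact._aacx._aada").text
    # visibility_of_all_elements_located returns a list, so indexing is safe
    details = wait.until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, "span._ac2a._ac2b")))
    rows.append({
        "username_ig": usernameChan,
        "jumlah_posting": details[0].text,
        "followers": details[1].text,
        "followings": details[2].text,
    })
    driver.quit()

df = pd.DataFrame(rows)
df.to_csv("instagram_stats.csv", index=False)  # hypothetical output filename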

How to move to the next enclosing(div) while scraping a site?

All the data is populated from the first table. I cannot move to the next div and get the data of the td for each tr.
The site: https://asd.com/page/
Below is the code that I have written.
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))
my_url= "https://asd.com/page/asd"
driver.get(my_url)
boxes = driver.find_elements(By.CLASS_NAME, "col-md-4")
companies = []
company = {}
for box in boxes:
    header = box.find_element(By.CLASS_NAME,"text-primary.text-uppercase")
    company['name']= header.text
    td= box
    company['Type']= td.find_element(By.XPATH,"//div/div/div/table/tbody/tr[1]/td").text
    company['Capital']= td.find_element(By.XPATH,"//div/div/div/table/tbody/tr[2]/td").text
    company['Address'] = td.find_element(By.XPATH,"//div/div/div/table/tbody/tr[3]/td").text
    company['Owner'] = td.find_element(By.XPATH,"//div/div/div/table/tbody/tr[4]/td").text
    company['Co-Owner'] = td.find_element(By.XPATH,"//div/div/div/table/tbody/tr[5]/td").text
    company['Duration'] = td.find_element(By.XPATH,"//div/div/div/table/tbody/tr[6]/td").text
    company['Place'] = td.find_element(By.XPATH,"//div/div/div/table/tbody/tr[7]/td").text
    company['Company ID'] = td.find_element(By.XPATH,"//div/div/div/table/tbody/tr[8]/td").text
    companies.append(company)
    print(company)
There are several issues here:
You need to add some delay between driver.get(my_url) and boxes = driver.find_elements(By.CLASS_NAME, "col-md-4") to let the elements load before getting the list of all of them.
text-primary.text-uppercase is actually 2 class names, text-primary and text-uppercase, so you should use XPATH or CSS_SELECTOR to locate an element by 2 class names, not CLASS_NAME (see the short example after these points).
In order to locate elements inside another element you should use an XPATH starting with a dot (.).
Your locators like //div/div/div/table/tbody/tr[1]/td are absolute, while they should be calculated relative to the parent box element.
No need to define the td element; you can use the existing box element here.
Locators like //div/div/div/table/tbody/tr[1]/td can and should be improved.
You probably will need to scroll to boxes while iterating over them.
I think company = {} should be defined inside the loop.
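For example, a header carrying both class names could be located with a CSS selector like this (a small illustration, not part of the original answer; box is assumed to be one of the col-md-4 elements found earlier):
# Locate the <h5> header by both of its class names using a CSS selector
header = box.find_element(By.CSS_SELECTOR, "h5.text-primary.text-uppercase")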
This should work better:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
import time
from webdriver_manager.chrome import ChromeDriverManager
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))
my_url= "https://monentreprise.bj/page/annonces"
driver.get(my_url)
wait = WebDriverWait(driver, 20)
actions = ActionChains(driver)
wait.until(EC.presence_of_element_located((By.CLASS_NAME, "col-md-4")))
time.sleep(2)
boxes = driver.find_elements(By.CLASS_NAME, "col-md-4")
companies = []
for box in boxes:
    actions.move_to_element(box).perform()
    time.sleep(0.3)
    company = {}
    header = box.find_element(By.XPATH,".//h5[@class='text-primary text-uppercase']")
    company['name']= header.text
    company['Objet']= box.find_element(By.XPATH,".//tr[1]/td").text
    company['Capital']= box.find_element(By.XPATH,".//tr[2]/td").text
    company['Siège Social'] = box.find_element(By.XPATH,".//tr[3]/td").text
    company['Gérant'] = box.find_element(By.XPATH,".//tr[4]/td").text
    company['Co-Gérant'] = box.find_element(By.XPATH,".//tr[5]/td").text
    company['Durée'] = box.find_element(By.XPATH,".//tr[6]/td").text
    company['Dépôt'] = box.find_element(By.XPATH,".//tr[7]/td").text
    company['Immatriculation RCCM'] = box.find_element(By.XPATH,".//tr[8]/td").text
    companies.append(company)
    print(company)

YouTube comment number scraping with Selenium

In my project I have tried to scrape the YouTube view count, comment count, and like and dislike counts. I can't get the comment count; I have tried different methods but nothing changes. Here is my code, please help me:
import selenium
from selenium import webdriver
import pandas as pd
import time
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
#we choose our browser chromedriver must be in the path
driver = webdriver.Chrome()
#we need data to save variables
data = {'Likes' : [], 'Dislikes' : [], 'Comments' : [], 'Views' : []}
dataframe = pd.DataFrame(data)
# we get the link
driver.get("https://www.youtube.com/watch?v=fHI8X4OXluQ")
# we wait for opening the link
time.sleep(5)
# we find element by xpath which means manually
Likes = driver.find_element_by_xpath('/html/body/ytd-app/div/ytd-page-manager/ytd-watch-flexy/div[5]/div[1]/div/div[8]/div[2]/ytd-video-primary-info-renderer/div/div/div[3]/div/ytd-menu-renderer/div[2]/ytd-toggle-button-renderer[1]/a/yt-formatted-string').text
Dislikes = driver.find_element_by_xpath('/html/body/ytd-app/div/ytd-page-manager/ytd-watch-flexy/div[5]/div[1]/div/div[8]/div[2]/ytd-video-primary-info-renderer/div/div/div[3]/div/ytd-menu-renderer/div[2]/ytd-toggle-button-renderer[2]/a/yt-formatted-string').text
View = driver.find_elements_by_xpath('//div[@id="count"]')
Comments = driver.find_elements_by_xpath('/html/body/ytd-app/div/ytd-page-manager/ytd-watch-flexy/div[5]/div[1]/div/ytd-comments/ytd-item-section-renderer/div[1]/ytd-comments-header-renderer/div[1]/h2/yt-formatted-string/span[1]')
print(Likes)
print(Dislikes)
print(View[1].text)
print(Comments)
driver.quit()
Basically, something like this should work:
import selenium
from selenium import webdriver
import pandas as pd
import time
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
#we choose our browser chromedriver must be in the path
driver = webdriver.Chrome()
#we need data to save variables
data = {'Likes' : [], 'Dislikes' : [], 'Comments' : [], 'Views' : []}
dataframe = pd.DataFrame(data)
# we get the link
driver.get("https://www.youtube.com/watch?v=fHI8X4OXluQ")
# we wait for opening the link
time.sleep(5)
likes_xpath = '(//div[@id="top-level-buttons-computed"]//*[contains(@aria-label," likes")])[last()]'
# we find element by xpath which means manually
Likes = driver.find_element_by_xpath(likes_xpath).text
dislikes_xpath = '//div[@id="top-level-buttons-computed"]//*[contains(@aria-label," dislikes")]'
Dislikes = driver.find_element_by_xpath(dislikes_xpath).text
views_xpath = '//*[name()="ytd-video-view-count-renderer"]/span[@class="view-count style-scope ytd-video-view-count-renderer"]'
View = driver.find_elements_by_xpath(views_xpath)
comments_xpath = '//*[name()="ytd-comment-renderer"]//*[name()="yt-formatted-string" and @id="content-text"]'
Comments=driver.find_elements_by_xpath(comments_xpath)
print(Likes)
print(Dislikes)
print(View[1].text)
print(Comments)
driver.quit()
However, there are a lot of comments there, so in order to get all of them you will have to scroll the page.
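A rough sketch of that scrolling (the scroll count and pause are arbitrary values to tune, not part of the original answer):
# Rough sketch: scroll down a few times so YouTube lazily loads more comments.
# 10 scrolls and a 2-second pause are arbitrary values you would tune.
for _ in range(10):
    driver.execute_script("window.scrollTo(0, document.documentElement.scrollHeight);")
    time.sleep(2)
Comments = driver.find_elements_by_xpath(comments_xpath)
print(len(Comments))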
See if this works for the comments count:
elem = driver.find_element_by_xpath(".//div[@class='style-scope ytd-comments-header-renderer' and @id='title']//following-sibling::yt-formatted-string[contains(@class,'ytd-comments-header-renderer')]/span[1]")
driver.execute_script("arguments[0].scrollIntoView();", elem)
elem.text

Using Selenium css selector to extract data

Hello, I wrote this code that returns a list of li elements, but I want to access each a tag mentioned inside and open it. If you have any recommendation I would be very grateful.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import pandas as pd
import time
options = Options()
# Creating our dictionary
all_services = pd.DataFrame(columns=['Motif', 'Description'])
path = "C:/Users/Al4D1N/Documents/ChromeDriver_webscraping/chromedriver.exe"
driver = webdriver.Chrome(options=options, executable_path=path)
driver.get("https://www.mairie.net/national/acte-naissance.htm#plus")
list_of_services = driver.find_elements_by_css_selector(".list-images li")
I know that I need to iterate over each list_of_services item, but I don't know how I can open each a tag, since none of them have classes or ids that can help me tell them apart.
This is one way to extract all of the href links.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import pandas as pd
import time
options = Options()
# Creating our dictionary
all_services = pd.DataFrame(columns=['Motif', 'Description'])
path = "C:/Users/Al4D1N/Documents/ChromeDriver_webscraping/chromedriver.exe"
driver = webdriver.Chrome(options=options, executable_path=path)
driver.get("https://www.mairie.net/national/acte-naissance.htm#plus")
#Get all elements in class 'list-images'
list_of_services = driver.find_elements_by_class_name("list-images")
for service in list_of_services:
    #In each element, select the a tags
    atags = service.find_elements_by_css_selector('a')
    for atag in atags:
        #In each a tag, select the href
        href = atag.get_attribute('href')
        print(href)
Output:
https://www.mairie.net/national/acte-mariage.htm#acte-naissance
https://www.mairie.net/national/acte-deces.htm#acte-naissance
https://www.mairie.net/national/carte-identite.htm#acte-naissance
https://www.mairie.net/national/passeport.htm#acte-naissance
https://www.mairie.net/national/casier-judiciaire.htm#acte-naissance
https://www.mairie.net/national/demande-carte-electorale.htm#acte-naissance
https://www.mairie.net/national/cadastre-plu.htm#acte-naissance
https://www.mairie.net/national/carte-grise-en-ligne-par-internet.htm#acte-naissance
https://www.mairie.net/national/certificat-non-gage.htm#acte-naissance
https://www.mairie.net/national/permis-conduire-delivrance.htm#acte-naissance
https://www.mairie.net/national/changement-adresse.htm#acte-naissance
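Since the question also asks how to open each a tag, a small follow-up sketch (not part of the original answer) could collect the hrefs first and then visit them one by one with the same driver:
# Collect every href first; navigating inside the first loop would make the
# previously found elements stale.
hrefs = []
for service in list_of_services:
    for atag in service.find_elements_by_css_selector('a'):
        hrefs.append(atag.get_attribute('href'))

for href in hrefs:
    driver.get(href)
    time.sleep(2)  # crude pause; a WebDriverWait would be more robust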

Find by xpath for multiple td tags

I would like to download PDF files from this website https://www.asx.com.au/asx/statistics/prevBusDayAnns.do if two conditions are met. The first condition is that the 'ASX Code' has to match one of the codes in a list. The second condition is that the 'Headline' has to match 'Change in substantial holding'. My current code only finds by xpath if the 'ASX Code' = 'SPL'.
An example of what I am trying to achieve:
data1 = ['SPL', 'WBC', 'AAA']
WebDriverWait(driver,20).until(EC.element_to_be_clickable((By.XPATH,"//table//tr//td[text()={data1}]/following-sibling::td[3]/a"))).click()
My code:
chromeOptions=webdriver.ChromeOptions()
prefs = {"plugins.always_open_pdf_externally": True}
chromeOptions.add_experimental_option("prefs",prefs)
driver=webdriver.Chrome(executable_path=r"C:\Users\Harrison Pollock\Downloads\Python\chromedriver_win32\chromedriver.exe",chrome_options=chromeOptions)
driver.get("https://www.asx.com.au/asx/statistics/prevBusDayAnns.do")
WebDriverWait(driver,20).until(EC.element_to_be_clickable((By.XPATH,"//table//tr//td[text()='SPL']/following-sibling::td[3]/a"))).click()
WebDriverWait(driver,15).until(EC.number_of_windows_to_be(2))
driver.switch_to.window(driver.window_handles[-1])
WebDriverWait(driver,15).until(EC.element_to_be_clickable((By.XPATH,"//input[@value='Agree and proceed']"))).click()
I couldn't find the data set with ASX codes data1 = ['SPL', 'WBC', 'AAA'] on the web page at my location. However, here is an example of how to download multiple ASX codes in sequence.
Dataset: data1 = ['SW1', 'AME', 'BGA','PPT','AMP']
Store the href value of each matching row in a list, then iterate over the list and click the Agree and proceed button to download the PDF.
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium import webdriver
import time
chromeOptions=webdriver.ChromeOptions()
prefs = {"plugins.always_open_pdf_externally": True}
chromeOptions.add_experimental_option("prefs",prefs)
driver=webdriver.Chrome(executable_path=r"C:\Users\Harrison Pollock\Downloads\Python\chromedriver_win32\chromedriver.exe",chrome_options=chromeOptions)
driver.get("https://www.asx.com.au/asx/statistics/prevBusDayAnns.do")
data1 = ['SW1', 'AME', 'BGA','PPT','AMP']
pdfUrls=[]
for d in data1:
    try:
        pdfurl=driver.find_element_by_xpath("//table//tr//td[text()='{}']/following-sibling::td[3]/a[contains(.,'{}')]".format(d,"Change in substantial holding")).get_attribute("href")
        pdfUrls.append(pdfurl)
    except:
        print("No ASX code found with Headline Change in substantial holding : " + d)

for pdfurl in pdfUrls:
    driver.get(pdfurl)
    WebDriverWait(driver, 15).until(EC.element_to_be_clickable((By.XPATH, "//input[@value='Agree and proceed']"))).click()
    time.sleep(10) # pause to check download
    print("Downloaded pdf file")
Strictly speaking, the XPath would be:
//table//tr[./td[1][.="SPL" or .="WBC" or .="AAA"] and .//a/text()[1][contains(.,"Change in substantial holding")]]//@href
Working example (16/04/2020) with other codes:
//table//tr[./td[1][.="AME" or .="SW1" or .="WEB"] and .//a/text()[1][contains(.,"Change in substantial holding")]]//@href
In Python:
values = ['SPL', 'WBC', 'AAA']
response.xpath('//table//tr[./td[1][.="' + values[0] + '"] and .//a/text()[1][contains(.,"Change in substantial holding")]]//@href').extract()
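As a hedged sketch, the same multi-code XPath can be built from the list and used with Selenium; since Selenium cannot return attribute nodes, the sketch selects the a elements and reads href with get_attribute:
# Build the td[1] predicate from the codes, e.g. .="SPL" or .="WBC" or .="AAA"
values = ['SPL', 'WBC', 'AAA']
predicate = " or ".join('.="{}"'.format(v) for v in values)
xpath = ('//table//tr[./td[1][' + predicate + '] and '
         './/a/text()[1][contains(.,"Change in substantial holding")]]//a')
links = driver.find_elements_by_xpath(xpath)
pdfUrls = [link.get_attribute("href") for link in links]
print(pdfUrls)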
