I am studying web crawling against Naver, a Korean portal.
In the code below, I extract the seller (market) information inside the `while` loop.
There is a problem: as Developer Tools (F12) shows, some products stay hidden until you click the "[More same products]" tab.
A total of 24 elements should be collected, but the code below only crawls 20.
Please look at the values stored in the `market` variable; I would appreciate any answers on how to handle this.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
import time
url = 'https://search.shopping.naver.com/catalog/10555224834?query=c922%20pro&NaPm=ct%3Dl7ebx6qw%7Cci%3D1bf003bb08f09f0f6b4a7c0713e56125c0984e64%7Ctr%3Dslsl%7Csn%3D95694%7Chk%3D5e07d721000d98f35225c07de4fb325fbd2a76c1'
url2 = 'https://search.shopping.naver.com/catalog/21052026769?query=clp-609&NaPm=ct%3Dl7ee8y6g%7Cci%3D030b02f64d9acedb523928f8f2258d16a2883a84%7Ctr%3Dslsl%7Csn%3D95694%7Chk%3D44d698f0e32b84eb243e7a84f624b5d54c5379d2'
driver = webdriver.Chrome(ChromeDriverManager().install())
browser = driver.get(url)
time.sleep(2)
product_section = driver.find_element(By.CSS_SELECTOR,'div.product_section_price__b6yrx') # 상세구입조건 / 상품 db / 버튼을 모두 가지고 있는 div태그
# 상품옵션 선택
option_dic = {}
ul = product_section.find_elements(By.CSS_SELECTOR,'div.filter_condition_group__h8Gss > ul')
for lis in ul:
labels = lis.find_elements(By.CSS_SELECTOR,'label.filter_label__3GLbR')
for label in labels:
option_text = label.find_element(By.CSS_SELECTOR,'span.filter_text__J8EIh').text
option_dic[option_text]=label
if len(option_dic) > 1:
try:
comment = input (f'상세구입조건을 선택하세요! {option_dic.keys()} :')
option_dic[comment].click()
time.sleep(5)
except Exception as e:
print('오류가 발생했습니다. 다시 입력하세요!',str(e))
# 총 상품수
ul = driver.find_element(By.CSS_SELECTOR,'div.floatingTab_detail_tab__akl87 > ul')
lis = ul.find_elements(By.CSS_SELECTOR,'li')
if lis:
item_count = lis[0].find_element(By.CSS_SELECTOR,'a > em').text
time.sleep(3)
# 상품 크롤링
naver_item = []
page = 1
while True:
print('-'*50+f'{page}페이지 진행중'+'-'*50)
items = product_section.find_elements(By.CSS_SELECTOR,'ul.productList_list_seller__XGhCk > li')
for index, item in enumerate(items):
same_item = item.find_elements(By.CSS_SELECTOR,'div.productList_same_area__ULPvk')
if same_item:
for a_button in same_item:
button = a_button.find_element(By.CSS_SELECTOR,'a.productList_same__0QQHk')
button.click()
time.sleep(5)
## 판매처 market ##
imgs = item.find_elements(By.CSS_SELECTOR,'img')
if imgs:
for img in imgs:
market = item.find_element(By.CSS_SELECTOR,'img').get_attribute('alt')
# if imgs:
# for img in imgs:
# market = item.find_element(By.CSS_SELECTOR,'img').get_attribute('alt')
else:
market = item.find_element(By.CSS_SELECTOR,'a.productList_mall_link__TrYxC > span').text
print(market)
# if img:
# market = item.find_element(By.CSS_SELECTOR,'img').get_attribute('alt')
# else:
# market = item.find_element(By.CSS_SELECTOR,'a.productList_mall_link__TrYxC > span').text
# ## 상품명 ##
# name = item.find_element(By.CSS_SELECTOR,'a.productList_title__R1qZP').text
# ## 사이트 ##
# site = item.find_element(By.CSS_SELECTOR,'a.productList_title__R1qZP').get_attribute('href')
# ## 판매가 ##
# price_list = item.find_elements(By.CSS_SELECTOR,'a.productList_value__B_IxM > span')
# for em in price_list:
# price = em.find_element(By.CSS_SELECTOR,'em').text
# ## 배송비 ##
# delivery = item.find_element(By.CSS_SELECTOR,'div.productList_delivery__WwSwL').text
# item_dic = {"순위":index+1,"판매처":market,"상품명":name,"판매가":price,"배송비":delivery,"url":site}
# naver_item.append(item_dic)
print(len(naver_item))
print()
print(int(item_count))
print()
print('-'*50+f'{page}페이지 종료'+'-'*50)
if len(naver_item) >= int(item_count):
print('-'*50+'크롤링이 종료되었습니다.'+'-'*50)
break
# 다음 페이지 클릭
btn_dic = {}
a_tags = product_section.find_elements(By.CSS_SELECTOR,'div.pagination_pagination__JW7zT > a')
for index, a in enumerate(a_tags):
btn_dic[index]=a
btn_dic[page].click()
time.sleep(5)
page += 1
print(len(naver_item))
print(naver_item)
Related
Can anyone help with scraping https://www.whed.net/home.php?
The code I'm using gives me an empty DataFrame. I would love to get the universities with their websites and, if possible, their fields of study. My scraping skills are weak, so any guidance would be great. Thanks!
# Scrape every WHED institution: first collect the details-page link of each
# university for each country, then visit every link and extract its fields.
begin = time.time()
countries = ['Emirates', 'United States of America (all)']
result = []  # List to store all data
univ_links = []  # Links for all universities
fields = ['Street:', 'City:', 'Province:', 'Post Code:', 'WWW:', 'Fields of study:', 'Job title:']


def _collect_result_links(browser, links):
    """Append the details-page link of every university on the current results page."""
    # XPath attribute tests are written with "@"; the "#id" form is not valid
    # XPath and makes every lookup fail.
    university_form = browser.find_element_by_xpath('//*[@id="contenu"]').find_element_by_id('results')
    for entry in university_form.find_elements_by_xpath('//*[@id="results"]/li'):
        anchor = entry.find_element_by_class_name('details').find_elements_by_tag_name('a')[0]
        links.append(anchor.get_property('href'))  # University details link


webD = wb.Chrome(executable_path=r'C:\Users\Admin\OneDrive\Sagasit\chromedriver.exe')  # To launch chrome and run script
# Trigger the target website
webD.get("https://www.whed.net/results_institutions.php")
webD.implicitly_wait(5)

# Extend the seed list with every option of the country dropdown and the first
# option of every option group.
cntry_el = webD.find_elements_by_xpath('//*[@id="Chp1"]/option')
grps = webD.find_elements_by_xpath('//*[@id="Chp1"]/optgroup/option[1]')
countries.extend(c.text for c in cntry_el)
countries.extend(g.text for g in grps)

for cntry in countries:
    select = Select(webD.find_element_by_id('Chp1'))  # select country dropdown
    select.select_by_visible_text(cntry)  # choosing country
    Btn_GO = webD.find_element_by_xpath('//*[@id="fsearch"]/p/input')
    Btn_GO.click()
    select_rpp = Select(webD.find_element_by_name('nbr_ref_pge'))  # select results per page drop down
    select_rpp.select_by_visible_text('100')  # choosing 100 results per page option
    _collect_result_links(webD, univ_links)
    # Follow the "Next" link until it no longer exists.
    while True:
        try:
            webD.find_element_by_partial_link_text('Next').click()
            _collect_result_links(webD, univ_links)
        except NoSuchElementException:
            break

for l in univ_links:
    webD.get(l)
    webD.implicitly_wait(2)
    title = webD.find_element_by_xpath('//*[@id="page"]/div/div/div[2]/div[1]').text
    title_detailed = webD.find_element_by_xpath('//*[@id="page"]/div/div/div[2]/div[2]').text
    cntry_name = webD.find_element_by_xpath('//*[@id="contenu"]/p[2]').text
    t1 = webD.find_elements_by_class_name('dt')
    t2 = webD.find_elements_by_class_name('dd')
    labels = webD.find_elements_by_class_name('libelle')
    content = webD.find_elements_by_class_name('contenu')
    temp = {'Title': title, 'Detailed Title': title_detailed, 'Country': cntry_name}
    fos = ''
    fos1 = ''
    # Pair each "dt" heading with its "dd" value; zip() stops at the shorter
    # list instead of raising IndexError when the two differ in length.
    for heading_el, value_el in zip(t1, t2):
        heading = heading_el.text  # read once: every .text is a browser round-trip
        if heading == '' or heading == 'Address':
            continue
        temp[heading] = value_el.text.replace('\n', ',')
    for label_el, content_el in zip(labels, content):
        label = label_el.text
        if label not in fields:
            continue
        if label == 'Fields of study:':
            fos = fos + ',' + content_el.text
        elif label == 'Job title:':
            fos1 = fos1 + ',' + content_el.text
        else:
            temp[label[:-1]] = content_el.text  # drop the trailing ":" from the key
    temp.update({'Fields of study': fos.lstrip(','), 'Job titles': fos1.lstrip(',')})
    result.append(temp)

data = pd.DataFrame(result)
data
end = time.time()
print("Time taken : " + str(end - begin) + "s")
data.to_csv("WHED1.csv", index=False)
This code is what I could put together from a GitHub project.
It would be great if I could re-create the data and save it. I want to use it as a dropdown in a web application, to make sure the name of the university someone studied at is entered without mistakes.
Update 1/12/22 - Async
Found a much better solution using aiohttp, it also runs the entire list of countries in ~30 seconds instead of 3 hours
import json
import time
import aiohttp
import asyncio
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver.support.select import Select
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
def main():
    """Collect the WHED country list, scrape every country and write the result as JSON.

    The country values are read from the search form with a Selenium driver,
    the institutions are then fetched concurrently via ``fetch_all`` and the
    combined list is written to ``output.json`` in the working directory.
    The elapsed time of the scraping and writing phase is printed at the end.
    """
    print("Init")
    driver = init_driver()
    try:
        print("Opening Homepage")
        url = "https://www.whed.net/results_institutions.php"
        driver.get(url)
        time.sleep(1)

        print("Gathering Countries")
        countries = get_countries(driver)
    finally:
        # Shut the browser down even when loading or parsing the form fails,
        # so no orphaned Chrome process is left behind.
        driver.quit()

    print("Scraping")
    start = time.time()
    institution_list = asyncio.run(fetch_all(countries))

    print("Writing out")
    # The context manager closes the file even if serialisation raises.
    with open('output.json', 'w') as f:
        f.write(json.dumps(institution_list))
    end = time.time()
    print(f"Total time: {end - start}s")
def init_driver():
    """Build and return a headless Chrome WebDriver.

    The driver binary is taken from ``chromedriver.exe`` and the service log
    path is set to ``NUL``.
    """
    options = Options()
    options.add_argument("--headless")
    service = Service(executable_path='chromedriver.exe', log_path='NUL')
    return webdriver.Chrome(service=service, options=options)
def get_countries(driver):
    """Return the ``value`` attribute of every option in the ``Chp1`` dropdown except the first.

    The first option is discarded (presumably a placeholder entry; confirm
    against the live form).
    """
    dropdown = Select(driver.find_element(By.ID, "Chp1"))
    values = [option.get_attribute('value') for option in dropdown.options]
    del values[0]  # discard the first option
    return values
def extract_institutions(html, country):
    """Parse a WHED results page into a record of institutions for *country*.

    The first word of the ``p.infos`` paragraph is used as the result count.
    When that word is ``'No'`` an empty list is returned; otherwise a dict
    with the keys ``country``, ``count`` (the count as a string) and
    ``records`` (one ``name``/``url``/``country`` dict per result link).
    """
    soup = BeautifulSoup(html, 'html.parser')
    summary = str(soup.find('p', {'class': 'infos'}).text)
    print(summary)
    number_of_institutions = summary.split()[0]
    if number_of_institutions == 'No':
        print(f"No results for {country}")
        return []

    # Every result is an anchor carrying a relative link to its details page.
    records = [
        {
            'name': str(link.text).strip(),
            'url': 'https://www.whed.net/' + str(link.attrs['href']).strip(),
            'country': country,
        }
        for link in soup.find_all('a', {'class': 'fancybox fancybox.iframe'})
    ]
    return {
        'country': country,
        'count': number_of_institutions,
        'records': records,
    }
async def get_institutions(country, session):
    """POST a search for *country* to WHED and return the parsed result.

    The form asks for 10000 results per page (presumably so that one response
    holds every institution; confirm against the site). Any exception is
    reported on stdout and swallowed, in which case the coroutine returns
    ``None``.
    """
    endpoint = 'https://www.whed.net/results_institutions.php'
    form = {"Chp1": country, "nbr_ref_pge": 10000}
    try:
        async with session.post(url=endpoint, data=form) as response:
            body = await response.read()
            print(f"Successfully got {country}")
            return extract_institutions(body, country)
    except Exception as e:
        print(f"Unable to get {country} due to {e.__class__}.")
async def fetch_all(countries):
    """Fetch all *countries* concurrently over one shared aiohttp session.

    Returns the list produced by ``asyncio.gather``, i.e. one
    ``get_institutions`` result per country, in input order.
    """
    async with aiohttp.ClientSession() as session:
        pending = [get_institutions(country, session) for country in countries]
        return await asyncio.gather(*pending)
# Main call
main()
Old answer using synchronous algorithm
Improving on @Mithun's answer, since it doesn't really work: it gets stuck on the same page.
Also added direct access to the name and url to make it easier in case you want to access those.
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
# Synchronous scraper: select one country on the WHED search form, then walk
# the paginated result list and collect name, URL and country of every hit.
print("Init")
chrome_executable = Service(executable_path='chromedriver.exe', log_path='NUL')
chrome_options = Options()
chrome_options.add_argument("--headless")
driver = webdriver.Chrome(service=chrome_executable, options=chrome_options)

try:
    print("Opening Homepage")
    url = "https://www.whed.net/results_institutions.php"
    driver.get(url)
    time.sleep(1)

    print("Selecting country")
    select = Select(driver.find_element(By.ID, "Chp1"))
    country = "Albania"
    select.select_by_visible_text(country)
    time.sleep(.5)

    print("Searching")
    # XPath attribute tests use "@"; "[#value='Go']" is not valid XPath.
    driver.find_element(By.XPATH, "//input[@value='Go']").click()
    time.sleep(1)

    print("Parsing")
    html = driver.page_source
    soup = BeautifulSoup(html, 'html.parser')
    page = soup.find('p', {'class': 'infos'}).text
    # Despite its name this is the first word of the summary line, which the
    # loop below compares against a running count in steps of 10.
    number_of_pages = str(page).split()[0]
    # A non-numeric first word (e.g. a "No results" summary) means there is
    # nothing to page through; treat it as zero instead of crashing in int().
    total_results = int(number_of_pages) if number_of_pages.isdigit() else 0
    counter = 10
    results = []
    while True:
        raw = soup.find_all('a', {'class': 'fancybox fancybox.iframe'})
        for i in raw:
            results.append({
                'name': str(i.text).strip(),
                'url': 'https://www.whed.net/' + str(i.attrs['href']).strip(),
                'country': country
            })
        print(f'{len(results)}/{number_of_pages}')
        if counter >= total_results:
            break
        counter += 10
        driver.find_element(By.LINK_TEXT, "Next page").click()
        time.sleep(0.5)
        # Re-parse after navigation; the old soup still describes the previous page.
        soup = BeautifulSoup(driver.page_source, 'html.parser')
finally:
    # Release the browser even when a lookup or click fails mid-run.
    driver.quit()

print(results)
You can use Selenium to scrape data. The following code will help you scrape the university names for "United States of America (all)". Similarly, you can scrape for other countries as well using Loop or entering the name manually. If you need the field of study for every university, you can scrape its href using bs4 and its field of study.
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select
# Print the details text of every institution listed for
# "United States of America (all)", following the "Next page" links.
import time  # used below but absent from this listing's import lines

driver = webdriver.Chrome(r"chromedriver.exe")
try:
    url = "https://www.whed.net/results_institutions.php"
    driver.get(url)
    time.sleep(1)
    select = Select(driver.find_element(By.ID, "Chp1"))
    select.select_by_visible_text("United States of America (all)")
    time.sleep(1)
    # XPath attribute tests use "@"; "[#value='Go']" is not valid XPath.
    driver.find_element(By.XPATH, "//input[@value='Go']").click()
    time.sleep(1)
    html = driver.page_source
    soup = BeautifulSoup(html, 'html.parser')
    page = soup.find('p', {'class': 'infos'}).text
    # First word of the summary line; compared against a running count in
    # steps of 10 to decide when to stop paging.
    number_of_pages = str(page).split()[0]
    counter = 10
    while True:
        raw = soup.find_all('div', {'class': 'details'})
        for i in raw:
            text = str(i.text).lstrip()
            for whitespace in ("\n", "\r", "\t"):
                text = text.replace(whitespace, "")
            print(text)
        # Check after printing so the final page is emitted too.
        if counter >= int(number_of_pages):
            break
        counter += 10
        driver.find_element(By.LINK_TEXT, "Next page").click()
        time.sleep(0.5)
        # Re-parse the new page; without this the first page is printed over and over.
        soup = BeautifulSoup(driver.page_source, 'html.parser')
finally:
    driver.quit()
I'm trying to scrape data from this website: https://www.aliexpress.com/wholesale?catId=0&initiative_id=AS_20220313071939&SearchText=bluetooth+earphones In particular, I want to get all the reviews from each product page. The main issue is that I'm struggling to reach the button circled in the screenshot below, which I need in order to scrape each comment and the customer's country:
Here is a photo showing that:
enter image description here
This is my code :
from selenium import webdriver
from lxml import html
import cssselect
from time import sleep
from itertools import zip_longest
import csv
driver = webdriver.Edge(executable_path=r"C:/Users/OUISSAL/Desktop/wscraping/XEW/scraping/codes/msedgedriver")
url = 'https://www.aliexpress.com/wholesale?trafficChannel=main&d=y&CatId=0&SearchText=bluetooth+earphones<ype=wholesale&SortType=default&page={}'
with open ("data.csv", "w", encoding="utf-8") as csvfile:
wr = csv.writer(csvfile)
wr.writerow(["Title","Price", "Currency", "Reviews", "Number of orders", "Shipping Cost", "Product links", "Country","Comments"])
for page_nb in range(1, 4):
print('---', page_nb, '---')
driver.get(url.format(page_nb))
sleep(2)
current_offset = 0
while True:
driver.execute_script("window.scrollBy(0, window.innerHeight);")
sleep(.5)
new_offset = driver.execute_script("return window.pageYOffset;")
print(new_offset,current_offset)
if new_offset <= current_offset:
break
current_offset = new_offset
sleep(3)
tree = html.fromstring(driver.page_source)
results = []
for product in tree.xpath('//div[#class="JIIxO"]//a'):
title = product.xpath('.//h1/text()')
if title:
title = title[0]
price = product.cssselect('div.mGXnE._37W_B span')
price = [x.text for x in price]
currency = price[0]
price = ''.join(price[1:])
review = product.xpath('.//span[#class="eXPaM"]/text()')
if review:
review = review[0]
else:
review = ''
nb_sold = product.xpath('.//span[#class="_1kNf9"]/text()')
if nb_sold:
nb_sold = nb_sold[0]
else:
nb_sold = ''
ship_cost = product.xpath('.//span[#class="_2jcMA"]/text()')
if ship_cost:
ship_cost = ship_cost[0]
else:
ship_cost = ''
###########################################
links = product.xpath('//div[#class="JIIxO"]//a/#href')
if links:
links = links[0]
else:
links = ''
# scraping data from each inner page
for link in links :
driver.get(link)
sleep(2)
current_offset = 0
while True:
driver.execute_script("window.scrollBy(0, window.innerHeight);")
sleep(.5)
new_offset = driver.execute_script("return window.pageYOffset;")
print(new_offset,current_offset)
if new_offset <= current_offset:
break
current_offset = new_offset
sleep(3)
tree = html.fromstring(driver.page_source)
for cmt in tree.xpath('//*[#id="transction-feedback"]/div[5]/div[1]'):
country = cmt.xpath('.//div[#class="user-country"]//b/text()')
if country:
country = country[0]
else:
country = ''
comment = cmt.xpath('.//span[#id="0.0.0.i4.5dc4sSFDsSFD5B"]/text()')
if comment:
comment = comment[0]
else:
comment = ''
row = [title, price, currency, review, nb_sold, ship_cost, links,country, comment]
results.append(row)
print('len(results):', len(results))
wr.writerows(results)
driver.close()
There are two problems:
First:
You have to use html.fromstring(driver.page_source) AFTER you scroll down.
Second:
Items are added only when they are displayed inside the window (in the viewport), so you can't jump directly to the end of the page. You have to scroll partially (in a loop), e.g. by window.innerHeight.
# Scroll down one viewport height at a time until the page offset stops
# growing, i.e. until the bottom of the page has been reached.
current_offset = 0
reached_bottom = False
while not reached_bottom:
    driver.execute_script("window.scrollBy(0, window.innerHeight);")
    sleep(.5)  # JavaScript has time to add elements
    new_offset = driver.execute_script("return window.pageYOffset;")
    reached_bottom = new_offset <= current_offset
    if not reached_bottom:
        current_offset = new_offset
Full working code with other changes in xpath.
It gives me 60 items on every page.
from selenium import webdriver
from lxml import html
from time import sleep
from itertools import zip_longest
import csv
# Scrape title, price, currency, review score and order count of every product
# on the first three AliExpress search result pages into data.csv.
driver = webdriver.Edge(executable_path=r"C:/Users/OUISSAL/Desktop/wscraping/XEW/scraping/codes/msedgedriver")
#driver = webdriver.Firefox()

# The query parameter is "&ltype=wholesale"; the "<ype" form was an HTML-entity
# decoding artefact that corrupted the URL.
url = 'https://www.aliexpress.com/wholesale?trafficChannel=main&d=y&CatId=0&SearchText=bluetooth+earphones&ltype=wholesale&SortType=default&page={}'

try:
    # newline="" lets the csv module control line endings (no blank rows on Windows).
    with open("data.csv", "w", encoding="utf-8", newline="") as csvfile:
        wr = csv.writer(csvfile)
        wr.writerow(["Title", "Price", "Currency", "Reviews", "Number of orders"])

        for page_nb in range(1, 4):
            print('---', page_nb, '---')

            driver.get(url.format(page_nb))
            sleep(2)

            # Scroll partially: items are only added while they are inside the
            # viewport, so jumping straight to the end of the page misses them.
            current_offset = 0
            while True:
                driver.execute_script("window.scrollBy(0, window.innerHeight);")
                sleep(.5)  # JavaScript has time to add elements
                new_offset = driver.execute_script("return window.pageYOffset;")
                print(new_offset, current_offset)
                if new_offset <= current_offset:
                    break
                current_offset = new_offset

            sleep(3)

            # Parse only AFTER scrolling so lazily added items are in the source.
            tree = html.fromstring(driver.page_source)

            results = []

            # XPath attribute tests use "@class"; "#class" is not valid XPath.
            for product in tree.xpath('//div[@class="JIIxO"]//a'):
                title = product.xpath('.//h1/text()')
                if not title:
                    continue  # anchors without a heading are not product cards
                title = title[0]

                price_parts = [x.text or '' for x in product.cssselect('div.mGXnE._37W_B span')]
                # for `$ 35.00`: currency first, amount after it
                # (for `35.00 zł` use price_parts[-1] and price_parts[:-1] instead)
                currency = price_parts[0] if price_parts else ''
                price = ''.join(price_parts[1:])

                review = product.xpath('.//span[@class="eXPaM"]/text()')
                review = review[0] if review else ''

                nb_sold = product.xpath('.//span[@class="_1kNf9"]/text()')
                nb_sold = nb_sold[0] if nb_sold else ''

                results.append([title, price, currency, review, nb_sold])

            print('len(results):', len(results))
            wr.writerows(results)
finally:
    # Close the browser window even when a page fails to load or parse.
    driver.close()
I am working on a project that has me stuck. I have often asked questions here and you have helped me a lot, since I am still a beginner. My project is a competitive-monitoring table of hotel rates for an agency; building it by hand is painful, so I want to automate it. I have succeeded in extracting the rates and their prices, but the problem is that I want the script to give me only the selected room.
I am providing the code and the output. I removed from the output the data that I want to eliminate, and I also added images to clarify things. If any of you can help, thank you in advance.
NB: thanks to pmadhu's answer the problem is solved, but now it shows me the same rates for all hotels.
#!/usr/bin/env python
# coding: utf-8
import json
import time
from time import sleep
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.common.exceptions import StaleElementReferenceException
# create path and start webdriver
PATH = "C:\chromedriver.exe"
driver = webdriver.Chrome(PATH)
# first get website
driver.get('https://tn.tunisiebooking.com/')
wait = WebDriverWait(driver, 20)
# params to select
params = {
'destination': 'Nabeul',
'date_from': '24/08/2021',
'date_to': '25/08/2021',
'bedroom': '1'
}
# select destination
destination_select = Select(driver.find_element_by_id('ville_des'))
destination_select.select_by_value(params['destination'])
# select bedroom
bedroom_select = Select(driver.find_element_by_id('select_ch'))
bedroom_select.select_by_value(params['bedroom'])
# select dates
script = f"document.getElementById('depart').value ='{params['date_from']}';"
script += f"document.getElementById('checkin').value ='{params['date_to']}';"
driver.execute_script(script)
# submit form
btn_rechercher = driver.find_element_by_id('boutonr')
btn_rechercher.click()
sleep(10)
# ----------------------------------------------------------------------------
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import StaleElementReferenceException,NoSuchElementException
urls = []
hotels = driver.find_elements_by_xpath("//div[starts-with(#id,'produit_affair')]")
for hotel in hotels:
link = hotel.find_element_by_xpath(".//span[#class='tittre_hotel']/a").get_attribute("href")
urls.append(link)
for url in urls:
driver.get(url)
try:
name = driver.find_element_by_xpath("//div[#class='bloc_titre_hotels']/h2").text
arropt = driver.find_element_by_xpath("//div[contains(#class,'line_result')][1]")
opt = arropt.find_element_by_tag_name("b").text
num = len(arropt.find_elements_by_tag_name("option"))
optiondata = {}
achats = {}
marges= {}
selection = Select(driver.find_element_by_id("arrangement"))
for i in range(num):
try:
selection = Select(driver.find_element_by_id("arrangement"))
selection.select_by_index(i)
time.sleep(2)
arr = driver.find_element_by_xpath("//select[#id='arrangement']/option[#selected='selected']").text
prize = driver.find_element_by_id("prix_total").text
optiondata[arr]=prize
btn_passe = driver.find_element_by_xpath('//*[#id="resultat"]/div/form/div/div[2]/div[1]/div[2]/div[2]/div/div ')
btn_passe.click()
sleep(2)
# params to select
params = {
'civilite_acheteur': 'Mlle',
'prenom_acheteur': 'test',
'nom_acheteur': 'test',
'e_mail_acheteur': 'test#gmail.com',
'portable_acheteur': '22222222'
}
# select civilite
civilite_acheteur = Select(driver.find_element_by_id('civilite_acheteur'))
civilite_acheteur.select_by_value(params['civilite_acheteur'])
# saisir prenom
script = f"document.getElementById('prenom_acheteur').value ='{params['prenom_acheteur']}';"
script += f"document.getElementById('nom_acheteur').value ='{params['nom_acheteur']}';"
script += f"document.getElementById('e_mail_acheteur').value ='{params['e_mail_acheteur']}';"
script += f"document.getElementById('portable_acheteur').value ='{params['portable_acheteur']}';"
driver.execute_script(script)
# submit form
btn_rechercher = driver.find_element_by_id('titre_Hammamet')
btn_rechercher.click()
sleep(2)
btn_rechercher = driver.find_element_by_id('boutonr')
btn_rechercher.click()
sleep(3)
achat = driver.find_element_by_xpath('/html/body/header/div[2]/div[1]/div[1]/div[4]/div[2]/div[2]').text.replace(' TND', '')
achats[arr]=achat
marge =int(((float(prize) - float(achat)) / float(achat)) * 100);
marges[arr]=marge
optiondata[arr]=prize,achat,marge
driver.get(url)
btn_passe = driver.find_element_by_xpath('//*[#id="moteur_rech"]/form/div/div[3]/div')
btn_passe.click()
sleep(2)
except StaleElementReferenceException:
pass
except NoSuchElementException:
pass
print("{} : {} - {}".format(name,opt,optiondata))
Try below code once:
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import StaleElementReferenceException,NoSuchElementException
# For every hotel on the result page, open its detail page and record the
# total price shown for each option of the "arrangement" dropdown.
import time  # time.sleep is used below but not imported in this listing

urls = []  # detail-page links; must exist before the loop below appends to it

# XPath attribute tests use "@" ("#id"/"#class" are not valid XPath).
hotels = driver.find_elements_by_xpath("//div[starts-with(@id,'produit_affair')]")
for hotel in hotels:
    link = hotel.find_element_by_xpath(".//span[@class='tittre_hotel']/a").get_attribute("href")
    urls.append(link)

for url in urls:
    driver.get(url)
    try:
        name = driver.find_element_by_xpath("//div[@class='bloc_titre_hotels']/h2").text
        arropt = driver.find_element_by_xpath("//div[contains(@class,'line_result')][1]")
        opt = arropt.find_element_by_tag_name("b").text
        num = len(arropt.find_elements_by_tag_name("option"))
        optiondata = {}
        for i in range(num):
            try:
                # Look the dropdown up again on every pass; the reference from
                # the previous pass may have gone stale after the page updated.
                selection = Select(driver.find_element_by_id("arrangement"))
                selection.select_by_index(i)
                time.sleep(2)
                arr = driver.find_element_by_xpath("//select[@id='arrangement']/option[@selected='selected']").text
                prize = driver.find_element_by_id("prix_total").text
                optiondata[arr] = prize
            except StaleElementReferenceException:
                # Best effort: skip this option, but say so instead of hiding it.
                print("Skipped option {} of {}: element went stale".format(i, url))
        # Report inside the try so a hotel whose page could not be parsed never
        # prints the values left over from the previous hotel.
        print("{} : {} - {} - {}".format(name, opt, num, optiondata))
    except NoSuchElementException:
        print("Skipped {}: expected element not found".format(url))
And the output:
Tui Blue Scheherazade Sousse : Double Standard Vue Mer - 1 - {'Demi Pension': '114'}
Golf Residence GAS Sousse : Double--Standard - 2 - {'Demi Pension': '51', 'Petit Dejeuner': '42'}
Sindbad Center GAS Sousse : Chambre Double - 2 - {'Petit Dejeuner': '27', 'Logement seul': '22'}
How I can get the playlist urls stored like
here: https://www.youtube.com/watch?v=VpTRlS7EO6E&list=RDOIhVs0FQ8xc&index=5
with bs4?
Using
from bs4 import BeautifulSoup as bs
import requests
r = requests.get('https://www.youtube.com/watch?v=OIhVs0FQ8xc&list=RDOIhVs0FQ8xc&index=1')
page = r.text
soup=bs(page,'html.parser')
#print(soup)
res=soup.find_all('ytd-playlist-panel-video-renderer')
print(res)
doesn't return anything. Even the printed soup itself doesn't contain the links I am looking for (like href="/watch?v=puNOG62lf-Y&list=RDOIhVs0FQ8xc&index=2").
It is a javascript rendered page. You have to use selenium.
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
import time
# Render the watch page in a real browser (the playlist panel is built by
# JavaScript) and print every playlist entry element found in the final DOM.
url = 'https://www.youtube.com/watch?v=OIhVs0FQ8xc&list=RDOIhVs0FQ8xc&index=1'
driver = webdriver.Chrome(ChromeDriverManager().install())
try:
    driver.set_window_size(1024, 600)
    driver.maximize_window()
    driver.get(url)
    time.sleep(2)  # give the page scripts time to populate the panel
    soup = bs(driver.page_source, 'html.parser')
finally:
    # The page source is all that is needed; do not leave the browser running.
    driver.quit()
res = soup.find_all('ytd-playlist-panel-video-renderer')
print(res)
Install the required package using pip install webdriver-manager
Thank you!
Here some dirty code working for me:
#---------------------------------
# import modules
from bs4 import BeautifulSoup as bs
from selenium import webdriver
import time
import re
#---------------------------------
#
from webdriver_manager.firefox import GeckoDriverManager
driver = webdriver.Firefox(executable_path=GeckoDriverManager().install())
#---------------------------------
# get links from url
def get_links(driver, sleep_time,
              url='https://www.youtube.com/watch?v=OIhVs0FQ8xc&list=RDOIhVs0FQ8xc&index=1'):
    """Return the watch URLs of the videos in the playlist panel of *url*.

    Args:
        driver: Selenium WebDriver used to render the page.
        sleep_time: Seconds to wait after loading so the panel can render.
        url: Watch page to open. It is a parameter because the function
            previously read a global ``url`` that this listing never defines;
            the default is the playlist URL from the question.

    Returns:
        A list of ``https://www.youtube.com/watch?v=<id>`` strings, or
        ``False`` when no playlist entries were found.
    """
    # open driver window
    driver.set_window_size(1024, 600)
    driver.maximize_window()
    driver.get(url)
    # wait some seconds
    time.sleep(sleep_time)
    # get information from url
    soup = bs(driver.page_source, 'html.parser')
    res = soup.find_all('ytd-playlist-panel-video-renderer')
    # if there is no information return false
    if len(res) == 0:
        return False
    main_url = 'https://www.youtube.com/watch?v='
    urls = re.findall(r'watch.*list', str(res))
    # Each match is cut down with a[8:-9], which drops the leading "watch?v="
    # and the trailing 9 characters before "list"; only every second match is
    # kept (presumably each entry links to its video twice; confirm against
    # the page markup).
    return [main_url + str(a[8:-9]) for a in urls[::2]]
#---------------------------------
# set sleep timer
sleep_time = 10
# call function to get links
links = get_links(driver, sleep_time)
This works for me:
from selenium import webdriver # pip install selenium
import time
# make sure you download chrome driver from https://chromedriver.chromium.org/downloads and put it in folder 'driver'
# Load a playlist page, scroll until nothing more is loaded, then pull the
# video id and title of every entry straight out of the page source.
# Raw string: "\c" is not a valid escape sequence in a normal string literal.
driver = webdriver.Chrome(r'driver\chromedriver.exe')
try:
    driver.get('https://www.youtube.com/playlist?list=PLxvodScTx2RtAOoajGSu6ad4p8P8uXKQk') # put here your link

    # scroll page down
    old_position = 0
    new_position = None
    position_script = """return (window.pageYOffset !== undefined) ?
window.pageYOffset : (document.documentElement ||
document.body.parentNode || document.body);"""
    while new_position != old_position:
        old_position = driver.execute_script(position_script)
        time.sleep(1)
        driver.execute_script(
            """var scrollingElement = (document.scrollingElement ||
document.body);scrollingElement.scrollTop =
scrollingElement.scrollHeight;""")
        new_position = driver.execute_script(position_script)
    source_page = driver.page_source
finally:
    # Quit the browser even if loading or scrolling fails.
    driver.quit()

# extract the url's and name's
counter = 1
element_to_find = 'amp;index={}" ar'
video_index = source_page.find(element_to_find.format(counter)) #'amp;index=1" ar'
while video_index != -1:
    # The title is the value of the first title="..." attribute after the
    # index marker. str.find replaces the character-by-character scans, which
    # never terminated (or raised IndexError) when the marker was missing.
    title_attr = source_page.find('title="', video_index)
    if title_attr == -1:
        break
    start_title_position = title_attr + len('title="')
    tag_end = source_page.find('>', start_title_position)
    if tag_end == -1:
        break
    # Stop one character before ">" to drop the closing quote of the attribute.
    name = source_page[start_title_position:tag_end - 1] # extract the name of the video
    # Decode the HTML entity for a double quote (the previous replace('"', '"')
    # was a no-op).
    name = name.replace('&quot;', '"')
    # NOTE(review): relies on the 11-character id sitting at a fixed offset
    # before the marker; verify against the current page markup.
    video_id = source_page[video_index - 56: video_index - 45] # extract video id
    print(str(counter)
          + '. link: ' + 'https://www.youtube.com/watch?v=' + video_id +
          ', name: ' + name)
    counter += 1
    video_index = source_page.find(element_to_find.format(counter)) # continue the next video
The easiest solution is:
from pytube import Playlist
# Source playlist; replace YOUR-LINK with the real playlist id.
URL_PLAYLIST = "https://www.youtube.com/playlist?list=YOUR-LINK"

# Retrieve URLs of videos from playlist
playlist = Playlist(URL_PLAYLIST)
video_count = len(playlist.video_urls)
print('Number Of Videos In playlist: %s' % video_count)

# Iterating the playlist yields one video URL per entry.
urls = list(playlist)

print(urls)
I'm trying to scrape this page
https://www.vivareal.com.br/venda/pernambuco/recife/#onde=BR-Pernambuco-NULL-Recife
I scraped the first page of this website and used Selenium to click through to the next page, but I can only get the first page's content: when I scrape the second page, I get the same content as on the first page. I don't know how to fix this, or whether the website has some protection against scraping.
Could someone help me?
from bs4 import BeautifulSoup
import pandas as pd
from time import sleep
from time import time
from random import randint
from IPython.core.display import clear_output
from warnings import warn
from selenium import webdriver
def scrape():
cont = [True,True,True,True,False]
for times in cont:
if times != True:
driver = webdriver.Firefox(executable_path = 'geckodriver')
page = driver.get('https://www.vivareal.com.br/venda/pernambuco/recife/?#onde=BR-Pernambuco-NULL-Recife')
sleep(15)
titles = []
addresses = []
areas = []
rooms = []
bathes = []
values = []
start_time = time()
request = 0
soup = BeautifulSoup(page,'html.parser')
imov = soup.find_all('div', class_='property-card__main-content')
sleep(randint(8,15))
# Monitor
request += 1
elapsed_time = time() - start_time
print('Request: {}; Frequency: {} requests/s'.format(request, request/elapsed_time))
clear_output(wait = True)
# Throw a warning for non-200 status codes
if page.status_code != 200:
warn('Request: {}; Status code: {}'.format(requests, page.status_code))
# Break the loop if the number of requests is greater than expected
if request > 72:
warn('Number of requests was greater than expected.')
break
for container in imov:
# Título
title = container.h2.a.get_text()
t2 = title.strip()
titles.append(t2)
# Título
# Endereço
address = container.h2.span.get_text()
a2 = address.strip()
addresses.append(a2)
# Endereço
# Área
area = container.li.span.get_text()
ar2 = area.strip()
areas.append(ar2)
# Área
# Quartos
room = container.find(class_= "property-card__detail-item property-card__detail-room js-property-detail-rooms")
room2 = room.find('span', class_="property-card__detail-value js-property-card-value").get_text()
r2 = room2.strip()
rooms.append(r2)
# Quartos
# Banheiros
bath = container.find(class_= "property-card__detail-item property-card__detail-bathroom js-property-detail-bathroom")
bath2 = bath.find('span', class_="property-card__detail-value js-property-card-value").get_text()
b2 = bath2.strip()
bathes.append(b2)
# Banheiros
# Valor
value = container.section.div.get_text()
v2 = value.strip()
values.append(v2)
# Valor
# Dataframe e salvar
vivareal = pd.DataFrame({
"title": titles,
"address": addresses,
"area": areas,
"rooms":rooms,
"baths":bathes,
"value":values
})
vivareal.to_csv(r'output.csv')
prox = driver.find_element_by_xpath('//*[#title="Próxima página"]')
prox.click()
else:
print('Done!')
scrape()```
Although you put the click command at the end, when the loop moves on to the next iteration the first thing it does is create a new driver and then load the main Viva Real page for Pernambuco again. This is unwanted. Instead, you could do:
def scrape():
    """Create one Firefox driver, load the Recife listing once, then walk the flags.

    The driver creation and the initial page load happen before the loop, so
    they are executed a single time rather than once per iteration.
    """
    # The driver is created and the main page is requested only once
    driver = webdriver.Firefox(executable_path='geckodriver')
    page = driver.get('https://www.vivareal.com.br/venda/pernambuco/recife/?#onde=BR-Pernambuco-NULL-Recife')
    page_flags = [True, True, True, True, False]
    for keep_going in page_flags:
        if not keep_going:
            # Pause so the page has time to load
            sleep(15)
Your code is not working as expected, even with the fixes provided by @MarceloBaliu. Here is my code that (finally!) worked for me. I'm sharing it because it may help someone, just as this website helped me.
import re
import time

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import WebDriverException, ElementClickInterceptedException, NoSuchElementException
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
class ScraperVivaReal:
    """Scrape property cards from a VivaReal result listing and export them to CSV.

    The scraper drives a headless Firefox session, parses each result page
    with BeautifulSoup, and follows the "Próxima página" button until no
    further page can be opened or a page yields no cards.
    """

    # Base delay, in seconds, used for page loads and explicit waits.
    wait_time = 5
    # Placeholder stored for any field that cannot be extracted from a card.
    _MISSING = 'N/I'
    # Class string shared by the value <span> inside every detail item.
    _DETAIL_VALUE_CLASS = 'property-card__detail-value js-property-card-value'

    def __init__(self, url):
        """Start headless Firefox, open *url* and dismiss the cookie banner.

        Args:
            url: Address of the first result page to scrape.
        """
        # Initializing the webdriver
        options = webdriver.FirefoxOptions()
        options.add_argument('-headless')
        self.driver = webdriver.Firefox(options=options)
        self.driver.maximize_window()
        self.driver.get(url)
        time.sleep(self.wait_time)
        # Accepting cookies. XPath selects attributes with "@", not "#".
        try:
            WebDriverWait(self.driver, self.wait_time).until(
                EC.element_to_be_clickable((By.XPATH, '//*[@id="cookie-notifier-cta"]'))
            ).click()
        except TimeoutException:
            # No clickable banner showed up within the wait; carry on scraping.
            pass
        else:
            time.sleep(self.wait_time / 2)

    @staticmethod
    def _text(node, default='N/I'):
        """Return the stripped text of *node*, or *default* when *node* is None."""
        return default if node is None else node.get_text().strip()

    @classmethod
    def _detail_value(cls, card, item_class):
        """Return the value text of the <li> detail item with class *item_class*."""
        item = card.find('li', {'class': item_class})
        if item is None:
            return cls._MISSING
        return cls._text(item.find('span', {'class': cls._DETAIL_VALUE_CLASS}), cls._MISSING)

    @staticmethod
    def _split_address(complete_address):
        """Split an address string on "," and "-" into its components.

        Returns:
            A tuple ``(address, number, neighbourhood, city, state)``. The
            split yields 2 to 5 parts for the recognised layouts; components
            absent from the layout, and every component of an unrecognised
            layout, are ``'N/I'``.
        """
        parts = [item.strip() for item in re.split(r',|-', complete_address)]
        # Start from placeholders so no component is ever left unbound or
        # carried over from a previously processed card.
        address = number = neibhood = city = st = 'N/I'
        if len(parts) == 2:
            city, st = parts
        elif len(parts) == 3:
            neibhood, city, st = parts
        elif len(parts) == 4:
            address, neibhood, city, st = parts
        elif len(parts) == 5:
            address, number, neibhood, city, st = parts
        return address, number, neibhood, city, st

    def __scrape_page__(self):
        """Parse the page currently loaded in the driver.

        Returns:
            A list with one dict per property card; empty when the page
            source cannot be read or no card is found.
        """
        result = []
        # Extracting data from the page
        try:
            soup = BeautifulSoup(self.driver.page_source, 'html.parser')
        except WebDriverException:
            print('Webdriver was manually quit by the user!')
            return result
        # Iterating over the property cards containing search results
        for card in soup.find_all('div', {'class': 'property-card__content'}):
            title = self._text(card.find('span', {'class': 'property-card__title js-cardLink js-card-title'}))
            complete_address = self._text(card.find('span', {'class': 'property-card__address'}))
            area = self._text(card.find('span', {'class': 'property-card__detail-value js-property-card-value property-card__detail-area js-property-card-detail-area'}))
            rooms = self._detail_value(card, 'property-card__detail-item property-card__detail-room js-property-detail-rooms')
            baths = self._detail_value(card, 'property-card__detail-item property-card__detail-bathroom js-property-detail-bathroom')
            garage = self._detail_value(card, 'property-card__detail-item property-card__detail-garage js-property-detail-garages')
            # Extracting the price
            price_box = card.find('div', {'class': 'property-card__price js-property-card-prices js-property-card__price-small'})
            price = self._text(price_box.find('p') if price_box is not None else None)
            # Splitting the address
            address, number, neibhood, city, st = self._split_address(complete_address)
            # One dict per card, appended to the page result
            result.append({
                'Título': title,
                'Endereço': address,
                'Número': number,
                'Bairro': neibhood,
                'Cidade': city,
                'Estado': st,
                'Área': area,
                'Quartos': rooms,
                'Banheiros': baths,
                'Vagas': garage,
                'Preço': price,
            })
        return result

    def __next_page__(self):
        """Click the "Próxima página" button.

        Returns:
            True when the click was issued, False when the button is missing
            or the click is intercepted.
        """
        try:
            # The lookup is inside the try so a missing button is reported
            # instead of raising out of the scraper.
            next_element = self.driver.find_element(By.XPATH, '//*[@title="Próxima página"]')
            next_element.click()
        except ElementClickInterceptedException:
            print('"Próxima Página" element is not clickable!')
            return False
        except NoSuchElementException:
            print('"Próxima Página" element not found!')
            return False
        time.sleep(self.wait_time)
        return True

    def run(self, output):
        """Scrape every reachable result page and write the rows to a CSV file.

        Args:
            output: Path (or buffer) handed to ``DataFrame.to_csv``.
        """
        has_next = True
        final_result = []
        try:
            # Getting the information!
            while has_next:
                results = self.__scrape_page__()
                final_result.extend(results)
                print('Got {} results! Total Found: {}'.format(len(results), len(final_result)))
                if not results:
                    break
                has_next = self.__next_page__()
        finally:
            # Quitting Firefox even when scraping fails, so no browser is left behind
            self.driver.quit()
        # Exporting results to CSV
        df = pd.DataFrame(final_result)
        df.to_csv(output, sep=',')
# Run the scraper against the VivaReal listing URL below and export the
# collected rows to output.csv.
S = ScraperVivaReal('https://www.vivareal.com.br/venda/sp/paulinia/')
S.run('output.csv')