Selenium Web Scraping Opens Separate Browser - Python

I am working on a project analyzing the Supercluster Astronaut Database. I posted a StackOverflow question a few weeks ago about scraping the data from the website and got the code below from one of the helpful posters.
My only remaining issue with this process is that when I run the code, a browser window pops open linked to the data source I am trying to scrape. I've tried tinkering with the code to keep the browser window from popping up by commenting out a few lines here and there, but nothing I've tried seems to work properly. Can someone point me in the right direction to modify the code below so that a browser window doesn't pop up?
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
import time

data = []
url = 'https://www.supercluster.com/astronauts?ascending=false&limit=300&list=true&sort=launch%20order'

driver = webdriver.Chrome(ChromeDriverManager().install())
driver.maximize_window()
time.sleep(5)
driver.get(url)
time.sleep(5)

soup = BeautifulSoup(driver.page_source, 'lxml')
driver.close()

tags = soup.select('.astronaut_cell.x')
for item in tags:
    name = item.select_one('.bau.astronaut_cell__title.bold.mr05').get_text()
    country = item.select_one('.mouseover__contents.rel.py05.px075.bau.caps.small.ac')
    if country:
        country = country.get_text()
    data.append([name, country])

cols = ['name', 'country']
df = pd.DataFrame(data, columns=cols)
print(df)

I think you're looking to run your code in headless mode. You can add the --headless argument via the Options() class to achieve this.
Code:
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import time

data = []
url = 'https://www.supercluster.com/astronauts?ascending=false&limit=300&list=true&sort=launch%20order'

options = Options()
options.add_argument("--headless")

driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)
driver.maximize_window()
driver.get(url)
time.sleep(10)

soup = BeautifulSoup(driver.page_source, 'lxml')
driver.close()

tags = soup.select('.astronaut_cell.x')
for item in tags:
    name = item.select_one('.bau.astronaut_cell__title.bold.mr05').get_text()
    country = item.select_one('.mouseover__contents.rel.py05.px075.bau.caps.small.ac')
    if country:
        country = country.get_text()
    data.append([name, country])

cols = ['name', 'country']
df = pd.DataFrame(data, columns=cols)
print(df)
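Note: if you are on Selenium 4, webdriver.Chrome() no longer accepts the driver path as a positional argument. A minimal sketch of the same headless setup in Selenium 4 style, assuming webdriver-manager is still used:

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager

options = Options()
options.add_argument("--headless")  # run Chrome without opening a visible window

# Selenium 4 wraps the driver binary in a Service object
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)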

Related

Selenium not able to find all elements in HTML page

I am web scraping the real estate portal www.immobiliare.it.
Specifically, I am retrieving some information from the search page, which contains 25 properties per page. I have managed to retrieve almost everything, but I am having trouble retrieving the src of a map image that each property has. This map is after a CSS selector.
The HTML structure is the following:
I have been able to get this data with selenium:
https://stackoverflow.com/a/75020969/14461986
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager

options = Options()
options.headless = True
driver = webdriver.Chrome(options=options, service=Service(ChromeDriverManager().install()))

url = 'https://www.immobiliare.it/vendita-case/milano/forlanini/?criterio=dataModifica&ordine=desc&page=3'
driver.get(url)
soup = BeautifulSoup(driver.page_source, 'html.parser')

data = []
# Each property is contained in an li.in-realEstateResults__item
for property in soup.select('li.in-realEstateResults__item'):
    data.append({
        'id': property.get('id'),
        'MapUrl': property.select_one('[alt="mappa"]').get('src') if property.select_one('[alt="mappa"]') else None
    })

print(data)
However, after the 4th image the MapUrl comes back empty. The properties are correctly loaded, as I have checked the ids, and the HTML for the rest of the images is the same, but for a reason I do not understand the MapUrl is not retrieved. I would also welcome any advice on how to make this script more efficient.
The issue here is lazy loading, so you have to interact with the website and scroll down to force the images to load.
You may have to accept / close some popups (optional):
driver.find_element(By.CSS_SELECTOR,'#didomi-notice-agree-button').click()
driver.find_element(By.CSS_SELECTOR,'.nd-dialogFrame__close').click()
driver.find_element(By.CSS_SELECTOR,'section h1').click()
Now we can start scrolling (a simple but working solution; a more robust variant is sketched after the full example below):
for i in range(30):
    driver.find_element(By.CSS_SELECTOR, 'body').send_keys(Keys.PAGE_DOWN)
    time.sleep(0.3)
Example
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import time

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
url = 'https://www.immobiliare.it/vendita-case/milano/forlanini/?criterio=dataModifica&ordine=desc'
driver.get(url)

# accept / close the popups (optional)
driver.find_element(By.CSS_SELECTOR, '#didomi-notice-agree-button').click()
driver.find_element(By.CSS_SELECTOR, '.nd-dialogFrame__close').click()
driver.find_element(By.CSS_SELECTOR, 'section h1').click()

# scroll down to trigger the lazy loading
for i in range(30):
    driver.find_element(By.CSS_SELECTOR, 'body').send_keys(Keys.PAGE_DOWN)
    time.sleep(0.3)

soup = BeautifulSoup(driver.page_source, 'html.parser')

data = []
for e in soup.select('li.in-realEstateResults__item'):
    data.append({
        'title': e.a.get('title'),
        'imgUrls': [i.get('src') for i in e.select('.nd-list__item img')],
        'imgMapInfo': e.select_one('[alt="mappa"]').get('src') if e.select_one('[alt="mappa"]') else None
    })

print(data)
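The fixed 30 page-downs work, but a possible refinement (a sketch, not part of the original answer; scroll_until_stable is just an illustrative name) is to keep scrolling until the page height stops growing:

import time

def scroll_until_stable(driver, pause=0.5, max_rounds=50):
    # keep scrolling to the bottom until document.body.scrollHeight stops increasing
    last_height = driver.execute_script("return document.body.scrollHeight")
    for _ in range(max_rounds):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

You would call scroll_until_stable(driver) right before reading driver.page_source.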

Web Scraping AliExpress - Lazy Loading

I am trying to web scrape AliExpress using Selenium and Python. I'm following a YouTube tutorial and have followed every step, but I just can't seem to get it to work.
I tried using requests and BeautifulSoup as well, but it seems like AliExpress lazy loads its product listings. I tried using the window scroll script, but that didn't work; the content doesn't load until I scroll over it myself.
This is the url for the page I would like to web scrape
https://www.aliexpress.com/wholesale?trafficChannel=main&d=y&CatId=0&SearchText=dog+supplies&ltype=wholesale&SortType=default&g=n
This is the code I have currently. It doesn't return anything in the output. I think that's because it's trying to go through all the product listings but can't find any, since they haven't loaded yet...
Any suggestions/help would be greatly appreciated, sorry for the bad formatting and the bad code in advance.
Thank you!
"""
To do
HOT PRODUCT FINDER Enter: Keyword, to generate a url
Product Name
Product Image
Product Link
Sales Number
Price
Create an excel file that contains these data
Sort the list by top selling orders
Develop an algorithm for the velocity of the product (total sales increased / time?)
Scrape site every day """
import csv
from bs4 import BeautifulSoup
from selenium import webdriver
import time
import requests
import lxml

# Start up the web driver
driver = webdriver.Chrome()

# grab keywords
search_term = input('Keywords: ')

# url generator
def get_url(search_term):
    """Generate a url using the search term provided"""
    url_template = 'https://www.aliexpress.com/wholesale?trafficChannel=main&d=y&CatId=0&SearchText={}&ltype=wholesale&SortType=default&g=n'
    search_term = search_term.replace(" ", "+")
    return url_template.format(search_term)

url = get_url(search_term)
driver.get(url)

# scroll down to the end of the page
time.sleep(2)
driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')

# extract the collection
r = requests.get(url)
soup = BeautifulSoup(r.content, 'lxml')
productlist = soup.find_all('div', class_='list product-card')
print(productlist)
import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("start-maximized")
chrome_options.add_argument("disable-infobars")
chrome_options.add_argument("--disable-extensions")
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument('--disable-blink-features=AutomationControlled')

driver = webdriver.Chrome(executable_path='chromedriver.exe', options=chrome_options)

# grab keywords
search_term = input('Keywords: ')

# search on the site itself instead of building the url by hand
driver.get('https://www.aliexpress.com')
driver.implicitly_wait(10)
p = driver.find_element_by_name('SearchText')
p.send_keys(search_term)
p.send_keys(Keys.ENTER)

productlist = []
product = driver.find_element_by_xpath('//*[@id="root"]/div/div/div[2]/div[2]/div/div[2]/ul')

# scroll down in small steps so the lazy-loaded cards render
height = driver.execute_script("return document.body.scrollHeight")
for scrol in range(100, height - 1800, 100):
    driver.execute_script(f"window.scrollTo(0,{scrol})")
    time.sleep(0.5)
# driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')

div = []
list_i = []
item_title = []
a = []
for z in range(1, 16):
    div.append(product.find_element_by_xpath('//*[@id="root"]/div/div/div[2]/div[2]/div/div[2]/ul/div' + str([z])))
for pr in div:
    list_i.append(pr.find_elements_by_class_name('list-item'))
for pc in list_i:
    for p in pc:
        item_title.append(p.find_element_by_class_name('item-title-wrap'))
for pt in item_title:
    a.append(pt.find_element_by_tag_name('a'))
for prt in a:
    productlist.append(prt.text)
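The indexed XPaths above are brittle. As an alternative sketch (the class names list-item and item-title-wrap are taken from the code above and may have changed on the live site), you could wait for the product cards explicitly and pull the titles with shorter lookups:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# wait until at least one lazy-loaded product card is present, then collect the titles
wait = WebDriverWait(driver, 15)
cards = wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME, 'list-item')))
titles = [card.find_element(By.CLASS_NAME, 'item-title-wrap').text for card in cards]
print(titles)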

Problem crawling every next page using BeautifulSoup and webdriver

I am trying to crawl all the job links from https://www.vietnamworks.com/tim-viec-lam/tat-ca-viec-lam using BeautifulSoup and Selenium. The problem is that I can only crawl the links on the 1st page and don't know how to crawl the links from every following page.
This is the code I have tried:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support import expected_conditions as EC
import time
import requests
from bs4 import BeautifulSoup
import array as arr
import pandas as pd
# The first line imports the Web Driver, and the second imports Chrome Options
# -----------------------------------#
# Chrome Options
all_link = []
chrome_options = Options()
chrome_options.add_argument('--ignore-certificate-errors')
chrome_options.add_argument('--incognito')
chrome_options.add_argument('--window-size=1920x1080')
chrome_options.add_argument('--headless')
# -----------------------------------#
driver = webdriver.Chrome(chrome_options=chrome_options, executable_path="C:/webdriver/chromedriver.exe")

# Open url
url = "https://www.vietnamworks.com/tim-viec-lam/tat-ca-viec-lam"
driver.get(url)
time.sleep(2)
# -----------------------------------#
page_source = driver.page_source
soup = BeautifulSoup(page_source, "html.parser")
block_job_list = soup.find_all("div", {"class": "d-flex justify-content-center align-items-center logo-area-wrapper logo-border"})
for i in block_job_list:
    link = i.find("a")
    all_link.append("https://www.vietnamworks.com/" + link.get("href"))
Since your problem is traversing the pages, this code will help you do that. Insert your scraping code inside the while loop, as indicated in the comments.
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
import time
from webdriver_manager.chrome import ChromeDriverManager  # pip install webdriver_manager if not installed

option = webdriver.ChromeOptions()
CDM = ChromeDriverManager()
driver = webdriver.Chrome(CDM.install(), options=option)

url = 'https://www.vietnamworks.com/tim-viec-lam/tat-ca-viec-lam'
driver.get(url)
time.sleep(3)

page_num = 1
links = []
driver.execute_script("window.scrollTo(0, document.body.scrollHeight/2);")

while True:
    # create the soup element here so that it picks up the page source of every page

    # sample scraping of the urls of the jobs posted
    for i in driver.find_elements_by_class_name('job-title '):
        links.append(i.get_attribute('href'))

    # move to the next page
    try:
        print(f'On page {str(page_num)}')
        print()
        page_num += 1
        driver.find_element_by_link_text(str(page_num)).click()
        time.sleep(3)
    # raised only at the last page
    except NoSuchElementException:
        print('End of pages')
        break

driver.quit()
EDIT:
Simplified and modified the pagination method.
If you are using BeautifulSoup, then you have to move the page_source and soup variables inside the while loop, because the page source changes after every iteration. In your code you extracted only the first page's source, which is why you got the same output repeated once per page (a minimal sketch of this placement follows below).
By using ChromeDriverManager from the webdriver-manager package, there is no need to specify the location/executable path of the driver, so you can copy-paste this code and run it on any machine that has Chrome installed. If you haven't installed the package, run pip install webdriver_manager before running the code.
Warning: avoid displaying the actual username and password of any of your accounts, as you have in your GitHub code.
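For the BeautifulSoup variant, a minimal sketch of that placement, reusing the selector from your own code and the driver, links, page_num, and imports set up in the answer above:

from bs4 import BeautifulSoup

while True:
    # re-parse the page source on every iteration, since it changes after each pagination click
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    for block in soup.find_all("div", {"class": "d-flex justify-content-center align-items-center logo-area-wrapper logo-border"}):
        a = block.find("a")
        if a and a.get("href"):
            links.append("https://www.vietnamworks.com/" + a.get("href"))

    try:
        page_num += 1
        driver.find_element_by_link_text(str(page_num)).click()
        time.sleep(3)
    except NoSuchElementException:
        print('End of pages')
        break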

Can't find Element with Selenium

I'm trying to scrape the price of a flight from the Google Flights website using Selenium, but the element does not show up anywhere, not even when scraping the whole page. I've read that it might be because it's in a different frame, but how would I know which frame it is in?
Here is the website: https://www.google.es/flights?lite=0#flt=/m/0h3tv./m/05qtj.2018-12-14;c:EUR;e:1;a:FR;sd:1;t:f;tt:o
The price I'm looking for is: 32 €
And here is my code:
from bs4 import BeautifulSoup as soup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
chrome_options = Options()
chrome_options.add_argument("--headless")
d = webdriver.Chrome('/Users/davidgarciaballester/Desktop/chromedriver', options=chrome_options)
url='https://www.google.es/flights?lite=0#flt=/m/0h3tv./m/05qtj.2018-12-14;c:EUR;e:1;a:FR;sd:1;t:f;tt:o'
d.get(url)
precios = soup(d.page_source, 'html.parser').findAll('jsl',{'jstcache':'9322'})
print(precios)
d.quit();
Am I missing something? Thanks in advance.
EDIT 1: jstcache changed value to 9322
You can use the following CSS selector combination:
from selenium import webdriver
d = webdriver.Chrome()
d.get("https://www.google.es/flights?lite=0#flt=/m/0h3tv./m/05qtj.2018-12-14;c:EUR;e:1;a:FR;sd:1;t:f;tt:o")
item = d.execute_script("return document.querySelector('.flt-subhead1.gws-flights-results__price.gws-flights-results__cheapest-price span + jsl')")
print(item.text)
d.quit()
from bs4 import BeautifulSoup as soup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
d = webdriver.Chrome('C:\chromedriver_win32\chromedriver.exe')
url='https://www.google.es/flights?lite=0#flt=/m/0h3tv./m/05qtj.2018-12-14;c:EUR;e:1;a:FR;sd:1;t:f;tt:o'
d.get(url)
page = soup(d.page_source, 'html.parser')
precios = page.findAll('jsl',{'jstcache':'9322'})
print(precios)
d.quit();
This worked for me:
print(precios[0].text)
which gave me €32.
OK, I figured out what was going on: I wasn't giving the driver enough time to load the page. I fixed this by waiting a few seconds after loading the page.
Working code:
from bs4 import BeautifulSoup as soup
from selenium import webdriver
import time
from selenium.webdriver.chrome.options import Options
d = webdriver.Chrome('C:/Users/David/Desktop/chromedriver.exe')
url='https://www.google.es/flights?lite=0#flt=/m/0h3tv./m/05qtj.2018-12-14;c:EUR;e:1;a:FR;sd:1;t:f;tt:o'
d.get(url)
time.sleep(5)
page = soup(d.page_source, 'html.parser')
precios = page.findAll('jsl',{'jstcache':'9322'})
print(precios)
d.quit()
EDIT 1: As Idlehands pointed out, the jstcache number is probably dynamic and changes over time, so this approach was not well thought out. Instead I'm now using the CSS selector combination QHarr suggested. Working code:
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
whitelist = set('abcdefghijklmnopqrstuvwxyz ABCDEFGHIJKLMNOPQRSTUVWXYZ 0123456789')
chrome_options = Options()
chrome_options.add_argument("--headless")
d = webdriver.Chrome('C:/Users/David/Desktop/chromedriver.exe', options=chrome_options)
url='https://www.google.es/flights?lite=0#flt=/m/0h3tv./m/05qtj.2018-12-14;c:EUR;e:1;a:FR;sd:1;t:f;tt:o'
d.get(url)
time.sleep(2)
precio = d.execute_script("return document.querySelector('.flt-subhead1.gws-flights-results__price.gws-flights-results__cheapest-price span + jsl')").text
precio = ''.join(filter(whitelist.__contains__, precio))
print(precio)
d.quit()
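As a possible refinement (not part of the original answer), the fixed time.sleep(2) could be replaced with an explicit wait, so the script only blocks until the price element actually exists. The selector is the one used above and may break if Google changes its markup:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# wait up to 10 seconds for the cheapest-price element instead of sleeping a fixed time
wait = WebDriverWait(d, 10)
precio_el = wait.until(EC.presence_of_element_located(
    (By.CSS_SELECTOR, '.flt-subhead1.gws-flights-results__price.gws-flights-results__cheapest-price span + jsl')))
print(precio_el.text)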

web scraping w/age verification

Hello, I want to scrape data from a site with an age-verification pop-up using Python 3.x and BeautifulSoup. I can't get to the underlying text and images without clicking "yes" for "are you over 21". Thanks for any support.
EDIT: Thanks, with some help from a comment I see that I can use the cookies but am not sure how to manage/store/call cookies with the requests package.
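In case the requests route is still of interest, a sketch of carrying cookies across requests with a Session (the cookie name and value below are made up; check the browser's dev tools for the real ones the site sets after clicking yes):

import requests

# a Session stores cookies and sends them with every subsequent request
session = requests.Session()
session.cookies.set("age_verified", "true", domain="www.shopharborside.com")  # hypothetical cookie
response = session.get("https://www.shopharborside.com/oakland/#/shop/412")
print(response.status_code)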
EDIT 2: With some help from another user I am now using the selenium package, so that it will also work in case it's a graphical overlay (I think?). I'm having trouble getting it to work with the gecko driver but will keep trying! Thanks for all the advice again, everyone.
EDIT 3: OK, I have made progress and I can get the browser window to open using the gecko driver! Unfortunately it doesn't like that link specification, so I'm posting again. The link to click "yes" on the age verification is buried on that page as something called mlink...
EDIT 4: Made some progress, updated code is below. I managed to find the element in the XML code, now I just need to manage to click the link.
import time
import selenium
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from bs4 import BeautifulSoup

# Optional argument; if not specified, the PATH is searched.
driver = webdriver.Firefox(executable_path=r'/Users/jeff/Documents/geckodriver')
url = 'https://www.shopharborside.com/oakland/#/shop/412'
driver.get(url)

# click "yes" on the age verification
driver.find_element_by_class_name('hhc_modal-body').click()
# wait 1 second
time.sleep(1)
pagesource = driver.page_source
soup = BeautifulSoup(pagesource, 'html.parser')
# you can now enjoy soup
print(soup.prettify())
EDIT 5: Stuck again; here is the current code. I seem to have isolated the element "myBtnYes", but I get an error when running the code:
ElementClickInterceptedException: Message: Element is not clickable at point (625,278.5500030517578) because another element obscures it
import time
import selenium
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from bs4 import BeautifulSoup

# Optional argument; if not specified, the PATH is searched.
driver = webdriver.Firefox(executable_path=r'/Users/jeff/Documents/geckodriver')
url = 'https://www.shopharborside.com/oakland/#/shop/412'
driver.get(url)

driver.find_element_by_id('myBtnYes').click()
# wait 1 second
time.sleep(1)
pagesource = driver.page_source
soup = BeautifulSoup(pagesource, 'html.parser')
# you can now enjoy soup
print(soup.prettify())
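A common way around that interception error (a sketch, not something tried in the original thread) is to wait until the button is clickable and, if another element still overlays it, fall back to a JavaScript click:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import ElementClickInterceptedException

wait = WebDriverWait(driver, 10)
btn = wait.until(EC.element_to_be_clickable((By.ID, 'myBtnYes')))
try:
    btn.click()
except ElementClickInterceptedException:
    # click through the overlay with JavaScript as a fallback
    driver.execute_script("arguments[0].click();", btn)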
If your aim is to click through the verification, go with selenium:
PS: install selenium (pip install selenium) and get geckodriver (Firefox) or chromedriver (Chrome).
# Mossein~King (hi, I'm here to help)
import time
import selenium
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.firefox.options import Options
from bs4 import BeautifulSoup

# this is for headless mode. This will save you a bunch of research time (trust me)
options = Options()
options.add_argument("--headless")
driver = webdriver.Firefox(options=options)

# for graphical mode (you need geckodriver for Firefox)
# driver = webdriver.Firefox()

url = 'your-url'
driver.get(url)

# find the link to click
# example: if <a class='MosseinKing'>
driver.find_element_by_xpath("//a[@class='MosseinKing']").click()

# wait 1 second in case of transitions
time.sleep(1)
pagesource = driver.page_source
soup = BeautifulSoup(pagesource, 'html.parser')
# you can now enjoy soup
print(soup.prettify())
