Web scraping AliExpress - lazy loading - Python

I am trying to web scrape AliExpress using Selenium and Python. I'm following a YouTube tutorial and have followed every step, but I just can't seem to get it to work.
I also tried requests and BeautifulSoup, but it seems like AliExpress uses lazy loading on its product listings. I tried using the window scroll script, but that didn't work; the content doesn't load until I scroll manually.
This is the URL for the page I would like to scrape:
https://www.aliexpress.com/wholesale?trafficChannel=main&d=y&CatId=0&SearchText=dog+supplies&ltype=wholesale&SortType=default&g=n
This is the code I currently have. It doesn't return anything in the output. I think that's because it's trying to go through all the product listings but can't find any, since they haven't loaded...
Any suggestions/help would be greatly appreciated, sorry for the bad formatting and the bad code in advance.
Thank you!
"""
To do
HOT PRODUCT FINDER Enter: Keyword, to generate a url
Product Name
Product Image
Product Link
Sales Number
Price
Create an Excel file that contains this data
Sort the list by top selling orders
Develop an algorithm for the velocity of the product (total sales increased / time?)
Scrape site every day """
import csv
from bs4 import BeautifulSoup
from selenium import webdriver
import time
import requests
import lxml
#Starting Up the web driver
driver = webdriver.Chrome()
# grab Keywords
search_term = input('Keywords: ')
# url generator
def get_url(search_term):
    """Generate a url link using the search term provided"""
    url_template = 'https://www.aliexpress.com/wholesale?trafficChannel=main&d=y&CatId=0&SearchText={}&ltype=wholesale&SortType=default&g=n'
    search_term = search_term.replace(" ", "+")
    return url_template.format(search_term)
url = get_url(search_term)
driver.get(url)
#scrolling down to the end of the page
time.sleep(2)
driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
#Extracting the Collection
r = requests.get(url)
soup = BeautifulSoup(r.content,'lxml')
productlist = soup.find_all('div', class_='list product-card')
print(productlist)

import csv
from bs4 import BeautifulSoup
from selenium import webdriver
import time
import requests
import lxml
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("start-maximized")
chrome_options.add_argument("disable-infobars")
chrome_options.add_argument("--disable-extensions")
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument('--disable-blink-features=AutomationControlled')
driver = webdriver.Chrome(executable_path = 'chromedriver.exe',options = chrome_options)
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.keys import Keys
# grab Keywords
search_term = input('Keywords: ')
# url generator
driver.get('https://www.aliexpress.com')
driver.implicitly_wait(10)
p = driver.find_element_by_name('SearchText')
p.send_keys(search_term)
p.send_keys(Keys.ENTER)
productlist = []
product = driver.find_element_by_xpath('//*[@id="root"]/div/div/div[2]/div[2]/div/div[2]/ul')
height = driver.execute_script("return document.body.scrollHeight")
for scrol in range(100, height-1800, 100):
    driver.execute_script(f"window.scrollTo(0,{scrol})")
    time.sleep(0.5)
# driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
div = []
list_i = []
item_title = []
a = []
for z in range(1, 16):
    div.append(product.find_element_by_xpath('//*[@id="root"]/div/div/div[2]/div[2]/div/div[2]/ul/div'+str([z])))
for pr in div:
    list_i.append(pr.find_elements_by_class_name('list-item'))
for pc in list_i:
    for p in pc:
        item_title.append(p.find_element_by_class_name('item-title-wrap'))
for pt in item_title:
    a.append(pt.find_element_by_tag_name('a'))
for prt in a:
    productlist.append(prt.text)
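For reference, a minimal sketch of one way to handle the lazy loading: scroll in small increments with a pause between steps so each batch of cards can render, then parse driver.page_source with BeautifulSoup instead of re-fetching the URL with requests, which returns the HTML before any JavaScript has run. It reuses get_url and search_term from the first snippet and keeps the 'list product-card' selector from it, which may not match the live AliExpress markup.
# Sketch only: the scroll step, pause, and the 'list product-card' class are assumptions.
import time
from bs4 import BeautifulSoup
from selenium import webdriver

driver = webdriver.Chrome()
driver.get(get_url(search_term))

# Scroll down in small increments so the lazy loader has time to fetch each batch.
position = 0
page_height = driver.execute_script("return document.body.scrollHeight")
while position < page_height:
    position += 500
    driver.execute_script(f"window.scrollTo(0, {position});")
    time.sleep(1)  # give the newly loaded cards time to render
    page_height = driver.execute_script("return document.body.scrollHeight")

# Parse the browser's rendered HTML rather than re-requesting the URL with requests.
soup = BeautifulSoup(driver.page_source, "lxml")
productlist = soup.find_all("div", class_="list product-card")
print(len(productlist))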

Related

Scrape a specific div value with BeautifulSoup in nested divs

I am currently trying to scrape a value from this website for a school project: https://data.census.gov/cedsci/table?q=53706%20income&tid=ACSST5Y2020.S1901
It's the first value below if you search for Median income (dollars), which should be the median income of the area. The comp-id keeps changing for some reason.
This median income estimate is what I'm looking for.
I tried several methods to walk the nested divs, but I can't get any results when I run them. Below is the code I tried; it just keeps returning nothing.
Any help will be appreciated, thanks!
import csv
import requests
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import pandas as pd
from bs4 import BeautifulSoup
DRIVER_PATH = 'chromedriver_107.exe'
driver = webdriver.Chrome(executable_path=DRIVER_PATH)
url = 'https://data.census.gov/cedsci/table?q=' + '53706' + '%20income&tid=ACSST5Y2020.S1901'
driver.get(url)
page = requests.get(url)
content = driver.page_source
soup = BeautifulSoup(content, 'lxml')
a = soup.findAll("div", {"comp-id":"1539"})
print(a)
Try with this:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
#set up Chrome driver
options=webdriver.ChromeOptions()
#Define web driver as a Chrome driver and navigate
driver = webdriver.Chrome()
driver.maximize_window()
url = 'https://data.census.gov/cedsci/table?q=53703%20income&tid=ACSST5Y2020.S1901'
driver.get(url)
# We print the label of row 11 (Which is the median)
label = WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.XPATH, "(//div[@row-id='11'])[1]")))
print(label.text)
# We print the values of row 11 (Which is the median)
values = WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.XPATH, "(//div[@row-id='11'])[2]")))
print(values.text)
Output:
Median income (dollars)
42,153
±3,200
114,643
±28,572
139,694
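If you prefer to hand the rendered page to BeautifulSoup as in the original attempt, a variation on the same idea is to wait for the table first and then parse driver.page_source; matching on row-id (taken from the answer's XPath) avoids the dynamically generated comp-id. This is only a sketch and the attribute may differ on the live page.
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get('https://data.census.gov/cedsci/table?q=53706%20income&tid=ACSST5Y2020.S1901')
# Wait until the median row has rendered, then parse the rendered HTML.
WebDriverWait(driver, 30).until(
    EC.presence_of_element_located((By.XPATH, "(//div[@row-id='11'])[1]")))
soup = BeautifulSoup(driver.page_source, 'lxml')
# 'row-id' is assumed from the answer's XPath; comp-id is generated at render time.
for row in soup.find_all('div', attrs={'row-id': '11'}):
    print(row.get_text(' ', strip=True))
driver.quit()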

How to collect all hrefs using xpath? Selenium - Python

I'm trying to collect all (5) of the social media links from the artist in this example. Currently, my output is only the LAST (fifth) social media link. I'm using Selenium; I understand this may not be the best option for collecting this data, but it's all I know at this time.
Note, I've only included relevant code for my question. Thank you in advance for any help/insight.
from cgitb import text
from os import link
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
import time
from random import randint
import pandas as pd
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('disable-infobars')
chrome_options.add_argument('--disable-extensions')
chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
driver = webdriver.Chrome(chrome_options=chrome_options)
for url in urls:
    driver.get("https://soundcloud.com/flux-pavilion")
    time.sleep(randint(3,4))
    try:
        links = driver.find_elements_by_xpath('//*[@id="content"]/div/div[4]/div[2]/div/article[1]/div[2]/ul/li//a[@href]')
        for elem in links:
            socialmedia = (elem.get_attribute("href"))
    except:
        links = "none"
    artist = {
        'socialmedia': socialmedia,
    }
    print(artist)
The problem is not with your XPath expression, but with the (non-existent) list processing in your output code.
Your code outputs only the last item of the resulting list, which is why you only received one link (the last one).
So change the output part of your code to
[...]
url = driver.get("https://soundcloud.com/flux-pavilion")
time.sleep(randint(3,4))
artist = []
try:
    links = driver.find_elements_by_xpath('//*[@id="content"]/div/div[4]/div[2]/div/article[1]/div[2]/ul/li//a[@href]')
    for elem in links:
        artist.append(elem.get_attribute("href"))
except:
    links = "none"
for link in artist:
    print(link)
And the output will contain all of the values (links) you desire:
https://gate.sc/?url=https%3A%2F%2Ftwitter.com%2FFluxpavilion&token=da4a8d-1-1653430570528
https://gate.sc/?url=https%3A%2F%2Finstagram.com%2FFluxpavilion&token=277ea0-1-1653430570529
https://gate.sc/?url=https%3A%2F%2Ffacebook.com%2FFluxpavilion&token=4c773c-1-1653430570530
https://gate.sc/?url=https%3A%2F%2Fyoutube.com%2FFluxpavilion&token=1353f7-1-1653430570531
https://gate.sc/?url=https%3A%2F%2Fopen.spotify.com%2Fartist%2F7muzHifhMdnfN1xncRLOqk%3Fsi%3DbK9XeoW5RxyMlA-W9uVwPw&token=bc2936-1-1653430570532
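As a side note, since find_elements_by_xpath already returns a list, the collection step can also be written as a single list comprehension (same XPath as above):
links = driver.find_elements_by_xpath('//*[@id="content"]/div/div[4]/div[2]/div/article[1]/div[2]/ul/li//a[@href]')
artist = [elem.get_attribute("href") for elem in links]
print(artist)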

Selenium Web Scraping Opens Separate Browser

I am working on a project analyzing the Supercluster Astronaut Database. I posted a StackOverflow question a few weeks ago about scraping the data from the website and got the code below from one of the helpful posters.
My only remaining issue with this process is that when I run the code, a browser window pops open showing the data source I am trying to scrape. I've tried tinkering with the code to keep the browser window from popping up by commenting out a few lines here and there, but nothing I've tried seems to work properly. Can someone point me in the right direction to modify the code below so a browser doesn't pop up?
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
import time
data = []
url = 'https://www.supercluster.com/astronauts?ascending=false&limit=300&list=true&sort=launch%20order'
driver = webdriver.Chrome(ChromeDriverManager().install())
driver.maximize_window()
time.sleep(5)
driver.get(url)
time.sleep(5)
soup = BeautifulSoup(driver.page_source, 'lxml')
driver.close()
tags = soup.select('.astronaut_cell.x')
for item in tags:
    name = item.select_one('.bau.astronaut_cell__title.bold.mr05').get_text()
    #print(name.text)
    country = item.select_one('.mouseover__contents.rel.py05.px075.bau.caps.small.ac')
    if country:
        country=country.get_text()
    #print(country)
    data.append([name, country])
cols=['name','country']
df = pd.DataFrame(data,columns=cols)
print(df)
I think you're looking to run your code in headless mode. You can add a headless argument via the Options() class to achieve this.
Code:
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import time
data = []
url = 'https://www.supercluster.com/astronauts?ascending=false&limit=300&list=true&sort=launch%20order'
options = Options()
options.add_argument("--headless")
driver = webdriver.Chrome(ChromeDriverManager().install(),options=options)
driver.maximize_window()
driver.get(url)
time.sleep(10)
soup = BeautifulSoup(driver.page_source, 'lxml')
driver.close()
tags = soup.select('.astronaut_cell.x')
for item in tags:
    name = item.select_one('.bau.astronaut_cell__title.bold.mr05').get_text()
    #print(name.text)
    country = item.select_one('.mouseover__contents.rel.py05.px075.bau.caps.small.ac')
    if country:
        country=country.get_text()
    #print(country)
    data.append([name, country])
cols=['name','country']
df = pd.DataFrame(data,columns=cols)
print(df)
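If you are on Selenium 4, the positional executable_path style shown above is deprecated in newer releases; the same headless idea looks roughly like the sketch below, reusing url from the answer (the --headless=new flag applies to recent Chrome builds).
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

options = Options()
options.add_argument("--headless=new")  # plain "--headless" on older Chrome versions
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
driver.get(url)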

How to scrape data after scrolling down a page that only loads 10 items at first and keeps adding more as you scroll

https://www.fynd.com/brands/
I am trying to scrape this page and get all the data in the title div tags, but more title tags appear as you scroll down. Initially, when the page loads it only shows a few brands, and as we scroll down manually it keeps adding brands. The code below is what I am using:
from bs4 import BeautifulSoup
from selenium import webdriver
import requests
import pandas as pd
import time
from selenium.common.exceptions import ElementClickInterceptedException
url = "https://www.fynd.com/brands/"
driver = webdriver.Chrome(executable_path ="D:\\chromedriver_win32\chromedriver.exe")
driver.get(url)
driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")
soup = BeautifulSoup(driver.page_source,"html.parser")
title = soup.find_all('span', class_="ukt-title clrWhite")
all_titles = list()
for jelly in title:
    all_titles.append(jelly.text.strip())
print(all_titles)
Screenshot of the titles which I want
Try the code below:
It's an infinite while loop over a list of web elements. Make sure to keep the same indentation:
# Imports assumed for this snippet; driver is set up as in the question.
from time import sleep
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains

driver.get("https://www.fynd.com/brands/")
while True:
    for item in driver.find_elements(By.XPATH, "//div[@data-cardtype='BRANDS']"):
        ActionChains(driver).move_to_element(item).perform()
        sleep(0.1)
        print(item.text)
You can optimize this script by removing the 0.1-second sleep; I only added it to make the scrolling visible.
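If the brands load on plain window scrolling (they may only load when the cards themselves enter the viewport, in which case the ActionChains loop above is the safer bet), an alternative sketch that keeps the question's BeautifulSoup parse is to scroll until document.body.scrollHeight stops growing and then collect everything in one pass:
import time
from bs4 import BeautifulSoup
from selenium import webdriver

driver = webdriver.Chrome()
driver.get("https://www.fynd.com/brands/")

# Keep scrolling to the bottom until the page height stops growing.
last_height = driver.execute_script("return document.body.scrollHeight")
while True:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(2)  # wait for the next batch of brands to load
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height

soup = BeautifulSoup(driver.page_source, "html.parser")
# Class name taken from the question; it may change on the live site.
all_titles = [span.text.strip() for span in soup.find_all("span", class_="ukt-title clrWhite")]
print(all_titles)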
It's a little bit different, but it works:
from selenium import webdriver
import chromedriver_autoinstaller
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import time
from selenium.webdriver.common.action_chains import ActionChains
chromedriver_autoinstaller.install()
driver = webdriver.Chrome()
i=0
f = open('yeshivaLinks.txt','w')
driver.get("https://www.yeshiva.org.il/ask/filter")
print(len(driver.find_elements(By.XPATH, '//*[@id="myLessonsScroller"]/ul/content-preview/a')))
for a in driver.find_elements(By.XPATH, '//*[@id="myLessonsScroller"]/ul/content-preview/a'):
    print(a.get_attribute('href'))
while True:
    for a in driver.find_elements(By.XPATH, '//*[@id="myLessonsScroller"]/ul/content-preview/a'):
        ActionChains(driver).move_to_element(a).perform()
        print(a.get_attribute('href'))
        f.write(a.get_attribute('href')+'\n')
        i = i+1
        if i == (len(driver.find_elements(By.XPATH, '//*[@id="myLessonsScroller"]/ul/content-preview/a')[i:]) - 15):
            for i in range(10):
                lastHeight = driver.execute_script("return document.body.scrollHeight")
                print(lastHeight)
                driver.execute_script('window.scrollTo(0, document.body.scrollHeight-50);')
                time.sleep(1)
I think you should try this.
driver.get("https://www.fynd.com/brands/")
while True:
    for item in driver.find_elements(By.XPATH, "//div[@data-cardtype='BRANDS']"):
        ActionChains(driver).move_to_element(item).perform()
        sleep(0.5)
        print(item.text)

While using beautifulsoup4 and selenium, output a None value if an element is nonexistent on the page

Good time of the day,
I'm currently working on a scraping project whose end goal is to create a DataFrame.
While I navigate from page to page, I have to gather different criteria. If a criterion is not present on the page, I would like to receive a None.
import re
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
import time
import random
from bs4 import BeautifulSoup
start_time = time.time()
url='https://www.immoweb.be/en/search/house/for-sale?countries=BE&page=1&orderBy=relevance'
driver = webdriver.Chrome()
driver.implicitly_wait(30)
driver.get(url)
time.sleep(random.uniform(1.0, 3.0))
python_button = driver.find_elements_by_xpath('//*[@id="uc-btn-accept-banner"]')[0]
python_button.click()
time.sleep(random.uniform(1.0, 3.0))
python_button = driver.find_elements_by_xpath('//*[@id="classified_9312278"]')[0]
python_button.click()
soup = BeautifulSoup(driver.page_source)
area = list()
for i in range(15):
    python_button = driver.find_elements_by_xpath('//*[@id="classifiedNavigation"]/ul/li[2]/a')[0]
    python_button.click()
    time.sleep(random.uniform(1.0, 3.0))
    soup = BeautifulSoup(driver.page_source)
    try:
        for table in soup.findAll("th", text=re.compile("Living area")):
            if table:
                area.append(table.find_next("td").next_element.strip())
            else:
                area.append(None)
    except:
        area.append(None)
houses = {"Area":area}
print(houses)
However, with the current code only existing values get appended to the list; when the criterion is missing, nothing is added, not even a blank.
And here is a link to the search
Thank you very much in advance!
It is pretty much obvious to me now:
if soup.findAll("th", text=re.compile("Living area")):
    for table in soup.findAll("th", text=re.compile("Living area")):
        area.append(table.find_next("td").next_element.strip())
else:
    area.append(None)
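An equivalent sketch using find(), which returns None when the header is missing, so a single lookup is enough if each page has at most one "Living area" row (an assumption about the listing markup):
# find() returns None when the header is absent on the current page.
th = soup.find("th", text=re.compile("Living area"))
if th:
    area.append(th.find_next("td").next_element.strip())
else:
    # Header not present on this page; record the gap explicitly.
    area.append(None)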
