Iterating through multiple URLs - Python

With your help, I was able to get a scraper running, but now I am stuck when it comes to iterating. Ultimately, I want the scraper to run through different URLs, but I'm getting confused by the syntax. I am using Selenium to open the web page and then BeautifulSoup to extract the data. I think I need to define the URLs and then use something like:
for url in urls:
but I am not sure how to use this. Reading other answers and videos has left me scratching my head.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import csv
import time
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup as soup
import pandas as pd
urls = ["https://racing.hkjc.com/racing/information/English/Racing/LocalResults.aspx?RaceDate=2021/02/06&Racecourse=ST&RaceNo=1","https://racing.hkjc.com/racing/information/English/Racing/LocalResults.aspx?RaceDate=2021/02/06&Racecourse=ST&RaceNo=2"]
driver = webdriver.Chrome()
driver.get(urls)
for url in urls:
    html = driver.execute_script("return document.getElementsByTagName('html')[0].innerHTML")
    WebDriverWait(driver, 20).until(EC.visibility_of_all_elements_located((By.CLASS_NAME, "f_fs13")))
    htmlStr = driver.page_source
    soup_level1 = soup(htmlStr, 'html.parser')
    race_soup = soup_level1.find('tbody',{'class':'f_fs13'}).find_parent('table')
    results_soup = soup_level1.find('tbody',{'class':'f_fs12'}).find_parent('table')
    df1 = pd.read_html(str(race_soup))[0]
    print(df1)
    df2 = pd.read_html(str(results_soup))[0]
    print(df2)
    print('good')
driver.close()
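A minimal sketch of how the iteration could be wired up, the key change being that driver.get(url) is called inside the loop so each page is actually loaded before it is parsed (the selectors come from the code above; collecting the resulting frames into lists is only an assumption about what you want to do with them):
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup as soup
import pandas as pd
urls = ["https://racing.hkjc.com/racing/information/English/Racing/LocalResults.aspx?RaceDate=2021/02/06&Racecourse=ST&RaceNo=1",
        "https://racing.hkjc.com/racing/information/English/Racing/LocalResults.aspx?RaceDate=2021/02/06&Racecourse=ST&RaceNo=2"]
driver = webdriver.Chrome()
race_frames, result_frames = [], []
for url in urls:
    driver.get(url)  # load the current URL before parsing it
    WebDriverWait(driver, 20).until(EC.visibility_of_all_elements_located((By.CLASS_NAME, "f_fs13")))
    page = soup(driver.page_source, 'html.parser')
    race_table = page.find('tbody', {'class': 'f_fs13'}).find_parent('table')
    results_table = page.find('tbody', {'class': 'f_fs12'}).find_parent('table')
    race_frames.append(pd.read_html(str(race_table))[0])       # race header table for this URL
    result_frames.append(pd.read_html(str(results_table))[0])  # results table for this URL
driver.quit()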

Related

Retrieving specific matches from a list in python

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from time import sleep
from datetime import datetime
import pandas as pd
import warnings
import os
os.chdir('C:/Users/paulc/Documents/Medium Football')
warnings.filterwarnings('ignore')
base_url = 'https://www.sportingindex.com/spread-betting/football/international-world-cup'
option = Options()
option.headless = False
driver = webdriver.Chrome("C:/Users/paulc/Documents/Medium Football/chromedriver.exe",options=option)
driver.get(base_url)
links = [elem.get_attribute("href") for elem in driver.find_elements(By.TAG_NAME,"a")]
This code retrieves all the href links on the page. I want to search the links list and return only the matches that contain 'https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a'.
However, I get AttributeError: 'NoneType' object has no attribute 'startswith' when using
import re
[x for x in links if x.startswith('https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a')]
Help is appreciated.
Instead of collecting all the a elements on the page, where there will be a lot of irrelevant results, you can use a more precise locator.
So, instead of
driver.find_elements(By.TAG_NAME,"a")
Use this:
driver.find_elements(By.XPATH,"//a[contains(@href,'https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a')]")
This will give you the desired elements only.
And this
links = [elem.get_attribute("href") for elem in driver.find_elements(By.XPATH,"//a[contains(@href,'https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a')]")]
will directly give you the wanted links only.
UPD
In case this gives you an empty list, you are possibly missing a delay. You can simply add a pause before that line, like time.sleep(2), but it's better to use WebDriverWait with expected_conditions explicit waits for that.
I can't check it since my computer blocks that link due to company policy (it's a gambling site), but normally something like this should work:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
wait = WebDriverWait(driver, 10)
links = [elem.get_attribute("href") for elem in wait.until(EC.visibility_of_all_elements_located((By.XPATH, "//a[contains(@href,'https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a')]")))]
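As a side note on the original error: get_attribute("href") returns None for anchors that have no href attribute, and that is what makes startswith blow up. If you do keep the broad By.TAG_NAME locator, filtering out the None values first avoids the AttributeError (a minimal sketch):
prefix = 'https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a'
links = [elem.get_attribute("href") for elem in driver.find_elements(By.TAG_NAME, "a")]
group_a_links = [x for x in links if x and x.startswith(prefix)]  # skip None hrefs before matching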
The following code filters the links to grab the right ones:
import time
from bs4 import BeautifulSoup
import pandas as pd
from selenium.webdriver.chrome.service import Service
from selenium import webdriver
webdriver_service = Service("./chromedriver") #Your chromedriver path
driver = webdriver.Chrome(service=webdriver_service)
driver.get('https://www.sportingindex.com/spread-betting/football/international-world-cup')
driver.maximize_window()
time.sleep(8)
soup = BeautifulSoup(driver.page_source,"lxml")
for u in soup.select('a[class="gatracking"]'):
    link = 'https://www.sportingindex.com' + u.get('href')
    if '-v-' in link:
        print(link)
Output:
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.24fdf8f5-b69b-4341-b6b4-d27605f7f7fd/spain-v-germany
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.c9bdf787-791a-47e0-b77c-a2d4cf567bfd/cameroon-v-serbia
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.5eddaa44-666b-47dc-8a0f-4ac758de00dc/south-korea-v-ghana
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.70cefd39-60f7-415e-9cb5-7a56acd403d6/brazil-v-switzerland
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.7fe0285e-366f-4f3c-b77f-4c96077a6c71/portugal-v-uruguay
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.dd7a995d-7478-45f8-af27-9f234d37cc76/ecuador-v-senegal
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.92232207-0f1e-4bb1-bacd-1332ef6b9007/netherlands-v-qatar
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.b913620e-69c7-4606-a153-7b48589b7c94/iran-v-usa
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.7a4a18fb-d4ee-4880-849f-f1afdea33cd5/wales-v-england
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.20c098b4-4e97-4fd1-97b0-f42d84424361/australia-v-denmark
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.5a7476e2-8d35-4a8e-8065-b4339e79f395/tunisia-v-france
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.8a869f02-9dd0-49c5-91bd-209ee224fc2a/poland-v-argentina
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.6379b787-f246-4ba4-a896-28a97396d02f/saudi-arabia-v-mexico
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.52737cfd-da19-42dd-b15b-c16c3e8e9a86/canada-v-morocco
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.168fab1f-8360-4e87-ba84-bfbd11a4a207/croatia-v-belgium
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.9fb541f0-43a4-409c-8e54-e34a43965714/costa-rica-v-germany
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.7379c8a7-ab5d-4653-b487-22bf7ff8eefe/japan-v-spain
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.e7e4c6be-98b7-4258-ba40-74c54a790fe1/ghana-v-uruguay
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.e4c18c81-565e-47ce-b08d-9aed62c88a5d/south-korea-v-portugal
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.18f44028-e23d-48d4-970b-e75c164589bd/cameroon-v-brazil
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.526f9b1b-6d95-4f44-abce-e0a6a30acfd4/serbia-v-switzerland
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.24fdf8f5-b69b-4341-b6b4-d27605f7f7fd/spain-v-germany
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.c9bdf787-791a-47e0-b77c-a2d4cf567bfd/cameroon-v-serbia
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.5eddaa44-666b-47dc-8a0f-4ac758de00dc/south-korea-v-ghana
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.70cefd39-60f7-415e-9cb5-7a56acd403d6/brazil-v-switzerland
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.7fe0285e-366f-4f3c-b77f-4c96077a6c71/portugal-v-uruguay
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.24fdf8f5-b69b-4341-b6b4-d27605f7f7fd/spain-v-germany
https://www.sportingindex.com/spread-betting/rugby-union/france-top-14/group_a.ad22f34f-9cd6-47b4-a826-0c0f0dce7df2/lyon-v-toulouse
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.24fdf8f5-b69b-4341-b6b4-d27605f7f7fd/spain-v-germany
https://www.sportingindex.com/spread-betting/rugby-union/france-top-14/group_a.ad22f34f-9cd6-47b4-a826-0c0f0dce7df2/lyon-v-toulouse
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.c9bdf787-791a-47e0-b77c-a2d4cf567bfd/cameroon-v-serbia
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.5eddaa44-666b-47dc-8a0f-4ac758de00dc/south-korea-v-ghana
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.70cefd39-60f7-415e-9cb5-7a56acd403d6/brazil-v-switzerland
https://www.sportingindex.com/spread-betting/football/international-world-cup/group_a.7fe0285e-366f-4f3c-b77f-4c96077a6c71/portugal-v-uruguay
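Note that the output above contains duplicates and a couple of non-World-Cup entries (the rugby lyon-v-toulouse links), because the '-v-' check matches any fixture link on the page. A slightly tighter version of the same loop, deduplicating with a set and restricting to the World Cup path, is sketched below:
seen = set()
for u in soup.select('a[class="gatracking"]'):
    link = 'https://www.sportingindex.com' + u.get('href')
    if '/international-world-cup/' in link and '-v-' in link and link not in seen:
        seen.add(link)
        print(link)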

Get all the product's href from AliExpress using selenium

I want to get and print all the URLs of the products on one of the AliExpress pages. It works, but only for the first 12 URLs that are shown, not for all the URLs on the page; normally there should be 60 URLs, because every AliExpress page contains 60 products. How can I extract all 60 URLs? Any help is highly appreciated.
This is my code:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from time import sleep
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.get("https://www.aliexpress.com/wholesale?trafficChannel=main&d=y&CatId=0&SearchText=smart+lock&ltype=wholesale&SortType=total_tranpro_desc")
urls = [a.get_attribute('href') for a in driver.find_elements(By.CSS_SELECTOR,'a[href*="/item/"]')]
for url in urls:
    print(url)
The web page does not present all the products at once. It uses a lazy-loading strategy and renders only the currently visible products.
To grab all the products you will have to scroll the page, collecting the presented products as you scroll. The URLs should be stored in a set to avoid duplicates.
I tried the following simple solution and it worked for me:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
import time
options = Options()
options.add_argument("start-maximized")
webdriver_service = Service(r'C:\webdrivers\chromedriver.exe')
driver = webdriver.Chrome(service=webdriver_service, options=options)
url = "https://www.aliexpress.com/wholesale?trafficChannel=main&d=y&CatId=0&SearchText=smart+lock&ltype=wholesale&SortType=total_tranpro_desc"
driver.get(url)
urls = set()
for i in range(5):
    block = [a.get_attribute('href') for a in driver.find_elements(By.CSS_SELECTOR, 'a[href*="/item/"]')]
    urls.update(block)
    driver.execute_script("window.scrollBy(0, arguments[0]);", 600)
    time.sleep(2)
for url in urls:
    print(url)
The output is:
https://he.aliexpress.com/item/1005004505011076.html?algo_pvid=cf9a5cc9-df53-404b-a8ed-db8a227069e6&algo_exp_id=cf9a5cc9-df53-404b-a8ed-db8a227069e6-33&pdp_ext_f=%7B%22sku_id%22%3A%2212000029397314035%22%7D&pdp_npi=2%40dis%21ILS%21680.47%21238.16%21%21%21%21%21%400b0a0ac216625452616837499e4d53%2112000029397314035%21sea&curPageLogUid=eOPom12i1YdL
https://he.aliexpress.com/item/1005003118129589.html?algo_pvid=cf9a5cc9-df53-404b-a8ed-db8a227069e6&algo_exp_id=cf9a5cc9-df53-404b-a8ed-db8a227069e6-3&pdp_ext_f=%7B%22sku_id%22%3A%2212000024195825216%22%7D&pdp_npi=2%40dis%21ILS%2117.75%2114.38%21%21%210.0%21%21%400b0a0ac216625452616837499e4d53%2112000024195825216%21sea&curPageLogUid=b4KiLx28AEal
https://he.aliexpress.com/item/1005004314575646.html?algo_pvid=cf9a5cc9-df53-404b-a8ed-db8a227069e6&algo_exp_id=cf9a5cc9-df53-404b-a8ed-db8a227069e6-27&pdp_ext_f=%7B%22sku_id%22%3A%2212000028723076890%22%7D&pdp_npi=2%40dis%21ILS%21626.53%21307.0%21%21%21%21%21%400b0a0ac216625452616837499e4d53%2112000028723076890%21sea&curPageLogUid=iOoGBaB9fk1i
https://he.aliexpress.com/item/4000154297870.html?algo_pvid=cf9a5cc9-df53-404b-a8ed-db8a227069e6&algo_exp_id=cf9a5cc9-df53-404b-a8ed-db8a227069e6-4&pdp_ext_f=%7B%22sku_id%22%3A%2212000017734780587%22%7D&pdp_npi=2%40dis%21ILS%2176.86%2139.18%21%21%217.4%21%21%400b0a0ac216625452616837499e4d53%2112000017734780587%21sea&curPageLogUid=iMbrWCxxlNRK
https://he.aliexpress.com/item/1005002868522660.html?algo_pvid=cf9a5cc9-df53-404b-a8ed-db8a227069e6&algo_exp_id=cf9a5cc9-df53-404b-a8ed-db8a227069e6-28&pdp_ext_f=%7B%22sku_id%22%3A%2212000022534558777%22%7D&pdp_npi=2%40dis%21ILS%2163.46%2138.69%21%21%2121.15%21%21%400b0a0ac216625452616837499e4d53%2112000022534558777%21sea&curPageLogUid=kzYKVU82QxVZ
https://he.aliexpress.com/item/1005004495398206.html?algo_pvid=cf9a5cc9-df53-404b-a8ed-db8a227069e6&algo_exp_id=cf9a5cc9-df53-404b-a8ed-db8a227069e6-44&pdp_ext_f=%7B%22sku_id%22%3A%2212000029361338239%22%7D&pdp_npi=2%40dis%21ILS%21303.25%21239.56%21%21%21%21%21%400b0a0ac216625452616837499e4d53%2112000029361338239%21sea&curPageLogUid=ZCNQVDFjiW24
https://he.aliexpress.com/item/1005002843575207.html?algo_pvid=cf9a5cc9-df53-404b-a8ed-db8a227069e6&algo_exp_id=cf9a5cc9-df53-404b-a8ed-db8a227069e6-30&pdp_ext_f=%7B%22sku_id%22%3A%2212000026607998519%22%7D&pdp_npi=2%40dis%21ILS%21294.41%21232.57%21%21%21%21%21%400b0a0ac216625452616837499e4d53%2112000026607998519%21sea&curPageLogUid=dKGPq84ciu6A
https://he.aliexpress.com/item/4001166182060.html?algo_pvid=cf9a5cc9-df53-404b-a8ed-db8a227069e6&algo_exp_id=cf9a5cc9-df53-404b-a8ed-db8a227069e6-14&pdp_ext_f=%7B%22sku_id%22%3A%2210000014983376216%22%7D&pdp_npi=2%40dis%21ILS%2183.98%2167.17%21%21%213.51%21%21%400b0a0ac216625452616837499e4d53%2110000014983376216%21sea&curPageLogUid=5P64jj4BsxMb
https://he.aliexpress.com/item/1005002864846136.html?algo_pvid=cf9a5cc9-df53-404b-a8ed-db8a227069e6&algo_exp_id=cf9a5cc9-df53-404b-a8ed-db8a227069e6-31&pdp_ext_f=%7B%22sku_id%22%3A%2212000027311675115%22%7D&pdp_npi=2%40dis%21ILS%21321.74%21254.18%21%21%2130.24%21%21%400b0a0ac216625452616837499e4d53%2112000027311675115%21sea&curPageLogUid=wIquG44FDWvS
https://he.aliexpress.com/item/1005004407267255.html?algo_pvid=cf9a5cc9-df53-404b-a8ed-db8a227069e6&algo_exp_id=cf9a5cc9-df53-404b-a8ed-db8a227069e6-25&pdp_ext_f=%7B%22sku_id%22%3A%2212000029260939933%22%7D&pdp_npi=2%40dis%21ILS%211117.23%21893.79%21%21%21%21%21%400b0a0ac216625452616837499e4d53%2112000029260939933%21sea&curPageLogUid=L9QVMI7Dkc9L
https://he.aliexpress.com/item/1005003202983418.html?algo_pvid=cf9a5cc9-df53-404b-a8ed-db8a227069e6&algo_exp_id=cf9a5cc9-df53-404b-a8ed-db8a227069e6-13&pdp_ext_f=%7B%22sku_id%22%3A%2212000024642176626%22%7D&pdp_npi=2%40dis%21ILS%2176.75%2127.75%21%21%21%21%21%400b0a0ac216625452616837499e4d53%2112000024642176626%21sea&curPageLogUid=w934YlWjZpSk
https://he.aliexpress.com/item/10000345027263.html?algo_pvid=cf9a5cc9-df53-404b-a8ed-db8a227069e6&algo_exp_id=cf9a5cc9-df53-404b-a8ed-db8a227069e6-40&pdp_ext_f=%7B%22sku_id%22%3A%2220000000178547076%22%7D&pdp_npi=2%40dis%21ILS%2173.66%2147.88%21%21%216.31%21%21%400b0a0ac216625452616837499e4d53%2120000000178547076%21sea&curPageLogUid=1CXW2eZmYyoz
https://he.aliexpress.com/item/1005003782715703.html?algo_pvid=cf9a5cc9-df53-404b-a8ed-db8a227069e6&algo_exp_id=cf9a5cc9-df53-404b-a8ed-db8a227069e6-41&pdp_ext_f=%7B%22sku_id%22%3A%2212000028520788363%22%7D&pdp_npi=2%40dis%21ILS%21733.13%21403.22%21%21%21%21%21%400b0a0ac216625452616837499e4d53%2112000028520788363%21sea&curPageLogUid=DzR0GUWN6DqQ
https://he.aliexpress.com/item/1005003093642821.html?algo_pvid=cf9a5cc9-df53-404b-a8ed-db8a227069e6&algo_exp_id=cf9a5cc9-df53-404b-a8ed-db8a227069e6-11&pdp_ext_f=%7B%22sku_id%22%3A%2212000025801875061%22%7D&pdp_npi=2%40dis%21ILS%21823.63%21271.78%21%21%21%21%21%400b0a0ac216625452616837499e4d53%2112000025801875061%21sea&curPageLogUid=yVO5hwgoHmV1
https://he.aliexpress.com/item/4000918150378.html?algo_pvid=cf9a5cc9-df53-404b-a8ed-db8a227069e6&algo_exp_id=cf9a5cc9-df53-404b-a8ed-db8a227069e6-23&pdp_ext_f=%7B%22sku_id%22%3A%2212000021385410206%22%7D&pdp_npi=2%40dis%21ILS%21178.69%21146.52%21%21%2186.4%21%21%400b0a0ac216625452616837499e4d53%2112000021385410206%21sea&curPageLogUid=C6dkvTMogJ62
https://he.aliexpress.com/item/1005001900549701.html?algo_pvid=cf9a5cc9-df53-404b-a8ed-db8a227069e6&algo_exp_id=cf9a5cc9-df53-404b-a8ed-db8a227069e6-16&pdp_ext_f=%7B%22sku_id%22%3A%2212000024274381461%22%7D&pdp_npi=2%40dis%21ILS%21188.58%21105.62%21%21%21%21%21%400b0a0ac216625452616837499e4d53%2112000024274381461%21sea&curPageLogUid=d1dAHgBFOsqF
https://he.aliexpress.com/item/1005003257440167.html?algo_pvid=cf9a5cc9-df53-404b-a8ed-db8a227069e6&algo_exp_id=cf9a5cc9-df53-404b-a8ed-db8a227069e6-29&pdp_ext_f=%7B%22sku_id%22%3A%2212000029883679176%22%7D&pdp_npi=2%40dis%21ILS%21891.4%21410.06%21%21%21%21%21%400b0a0ac216625452616837499e4d53%2112000029883679176%21sea&curPageLogUid=D3UOiF7Eswwj
https://he.aliexpress.com/item/33023975249.html?algo_pvid=cf9a5cc9-df53-404b-a8ed-db8a227069e6&algo_exp_id=cf9a5cc9-df53-404b-a8ed-db8a227069e6-19&pdp_ext_f=%7B%22sku_id%22%3A%2212000026754718473%22%7D&pdp_npi=2%40dis%21ILS%21132.1%2147.57%21%21%2112.0%21%21%400b0a0ac216625452616837499e4d53%2112000026754718473%21sea&curPageLogUid=iVmMqv3Hv2dw
https://he.aliexpress.com/item/1005004105323148.html?algo_pvid=cf9a5cc9-df53-404b-a8ed-db8a227069e6&algo_exp_id=cf9a5cc9-df53-404b-a8ed-db8a227069e6-35&pdp_ext_f=%7B%22sku_id%22%3A%2212000028159854219%22%7D&pdp_npi=2%40dis%21ILS%21497.87%21388.34%21%21%21%21%21%400b0a0ac216625452616837499e4d53%2112000028159854219%21sea&curPageLogUid=ScGffoZSAzHu
https://he.aliexpress.com/item/1005003076385676.html?algo_pvid=cf9a5cc9-df53-404b-a8ed-db8a227069e6&algo_exp_id=cf9a5cc9-df53-404b-a8ed-db8a227069e6-38&pdp_ext_f=%7B%22sku_id%22%3A%2212000023907005619%22%7D&pdp_npi=2%40dis%21ILS%2186.71%2124.94%21%21%21%21%21%400b0a0ac216625452616837499e4d53%2112000023907005619%21sea&curPageLogUid=QArTgWyf3MS7
https://he.aliexpress.com/item/1005003731084162.html?algo_pvid=cf9a5cc9-df53-404b-a8ed-db8a227069e6&algo_exp_id=cf9a5cc9-df53-404b-a8ed-db8a227069e6-2&pdp_ext_f=%7B%22sku_id%22%3A%2212000029315337088%22%7D&pdp_npi=2%40dis%21ILS%21743.72%21327.24%21%21%21%21%21%400b0a0ac216625452616837499e4d53%2112000029315337088%21sea&curPageLogUid=aMsEw0Z36uSN
https://he.aliexpress.com/item/4001010240435.html?algo_pvid=cf9a5cc9-df53-404b-a8ed-db8a227069e6&algo_exp_id=cf9a5cc9-df53-404b-a8ed-db8a227069e6-43&pdp_ext_f=%7B%22sku_id%22%3A%2210000013511866229%22%7D&pdp_npi=2%40dis%21ILS%2129.36%213.02%21%21%21%21%21%400b0a0ac216625452616837499e4d53%2110000013511866229%21sea&curPageLogUid=z1vD7tD9W5MZ
https://he.aliexpress.com/item/4000154478484.html?algo_pvid=cf9a5cc9-df53-404b-a8ed-db8a227069e6&algo_exp_id=cf9a5cc9-df53-404b-a8ed-db8a227069e6-8&pdp_ext_f=%7B%22sku_id%22%3A%2212000021350825852%22%7D&pdp_npi=2%40dis%21ILS%2178.64%2135.39%21%21%217.4%21%21%400b0a0ac216625452616837499e4d53%2112000021350825852%21sea&curPageLogUid=yRWy6gHc4b1E
https://he.aliexpress.com/item/1005003346086758.html?algo_pvid=cf9a5cc9-df53-404b-a8ed-db8a227069e6&algo_exp_id=cf9a5cc9-df53-404b-a8ed-db8a227069e6-32&pdp_ext_f=%7B%22sku_id%22%3A%2212000025331814334%22%7D&pdp_npi=2%40dis%21ILS%2195.31%2158.12%21%21%21%21%21%400b0a0ac216625452616837499e4d53%2112000025331814334%21sea&curPageLogUid=iGvJbXPDD3cg
https://he.aliexpress.com/item/4000065467682.html?algo_pvid=cf9a5cc9-df53-404b-a8ed-db8a227069e6&algo_exp_id=cf9a5cc9-df53-404b-a8ed-db8a227069e6-6&pdp_ext_f=%7B%22sku_id%22%3A%2210000015061976915%22%7D&pdp_npi=2%40dis%21ILS%21281.64%21191.53%21%21%21162.48%21%21%400b0a0ac216625452616837499e4d53%2110000015061976915%21sea&curPageLogUid=1qiD9mBYf6KM
https://he.aliexpress.com/item/1005002676337744.html?algo_pvid=cf9a5cc9-df53-404b-a8ed-db8a227069e6&algo_exp_id=cf9a5cc9-df53-404b-a8ed-db8a227069e6-1&pdp_ext_f=%7B%22sku_id%22%3A%2212000021669614390%22%7D&pdp_npi=2%40dis%21ILS%21561.28%21392.91%21%21%21%21%21%400b0a0ac216625452616837499e4d53%2112000021669614390%21sea&curPageLogUid=Y1Mvi9ey1Hwl
https://he.aliexpress.com/item/4000834334480.html?algo_pvid=cf9a5cc9-df53-404b-a8ed-db8a227069e6&algo_exp_id=cf9a5cc9-df53-404b-a8ed-db8a227069e6-34&pdp_ext_f=%7B%22sku_id%22%3A%2212000028610539908%22%7D&pdp_npi=2%40dis%21ILS%21162.83%21138.42%21%21%2168.96%21%21%400b0a0ac216625452616837499e4d53%2112000028610539908%21sea&curPageLogUid=6CijCm642BQh
https://he.aliexpress.com/item/4001031327478.html?algo_pvid=cf9a5cc9-df53-404b-a8ed-db8a227069e6&algo_exp_id=cf9a5cc9-df53-404b-a8ed-db8a227069e6-7&pdp_ext_f=%7B%22sku_id%22%3A%2212000015965032899%22%7D&pdp_npi=2%40dis%21ILS%211037.26%21383.79%21%21%21%21%21%400b0a0ac216625452616837499e4d53%2112000015965032899%21sea&curPageLogUid=fLjjoL5HUrxi
https://he.aliexpress.com/item/1005004262179786.html?algo_pvid=cf9a5cc9-df53-404b-a8ed-db8a227069e6&algo_exp_id=cf9a5cc9-df53-404b-a8ed-db8a227069e6-12&pdp_ext_f=%7B%22sku_id%22%3A%2212000028573420101%22%7D&pdp_npi=2%40dis%21ILS%21679.22%21319.24%21%21%21%21%21%400b0a0ac216625452616837499e4d53%2112000028573420101%21sea&curPageLogUid=n4io2MIKAEsP
https://he.aliexpress.com/item/1005003515525914.html?algo_pvid=cf9a5cc9-df53-404b-a8ed-db8a227069e6&algo_exp_id=cf9a5cc9-df53-404b-a8ed-db8a227069e6-0&pdp_ext_f=%7B%22sku_id%22%3A%2212000026134836061%22%7D&pdp_npi=2%40dis%21ILS%21493.37%21360.15%21%21%21%21%21%400b0a0ac216625452616837499e4d53%2112000026134836061%21sea&curPageLogUid=qY8tyv2zwcdK
https://he.aliexpress.com/item/1005003230965691.html?algo_pvid=cf9a5cc9-df53-404b-a8ed-db8a227069e6&algo_exp_id=cf9a5cc9-df53-404b-a8ed-db8a227069e6-26&pdp_ext_f=%7B%22sku_id%22%3A%2212000024764885745%22%7D&pdp_npi=2%40dis%21ILS%21188.3%21133.68%21%21%21%21%21%400b0a0ac216625452616837499e4d53%2112000024764885745%21sea&curPageLogUid=CJHjTgOCDRJ3
https://he.aliexpress.com/item/4000975407287.html?algo_pvid=cf9a5cc9-df53-404b-a8ed-db8a227069e6&algo_exp_id=cf9a5cc9-df53-404b-a8ed-db8a227069e6-10&pdp_ext_f=%7B%22sku_id%22%3A%2210000015021478939%22%7D&pdp_npi=2%40dis%21ILS%21680.62%21299.47%21%21%21%21%21%400b0a0ac216625452616837499e4d53%2110000015021478939%21sea&curPageLogUid=WDdJT8cOU94O
https://he.aliexpress.com/item/1005003187314155.html?algo_pvid=cf9a5cc9-df53-404b-a8ed-db8a227069e6&algo_exp_id=cf9a5cc9-df53-404b-a8ed-db8a227069e6-42&pdp_ext_f=%7B%22sku_id%22%3A%2212000024567051659%22%7D&pdp_npi=2%40dis%21ILS%21157.54%2194.54%21%21%21%21%21%400b0a0ac216625452616837499e4d53%2112000024567051659%21sea&curPageLogUid=4qQvpAeygSFw
https://he.aliexpress.com/item/32954893736.html?algo_pvid=cf9a5cc9-df53-404b-a8ed-db8a227069e6&algo_exp_id=cf9a5cc9-df53-404b-a8ed-db8a227069e6-39&pdp_ext_f=%7B%22sku_id%22%3A%2210000002371991125%22%7D&pdp_npi=2%40dis%21ILS%21303.58%21100.18%21%21%21%21%21%400b0a0ac216625452616837499e4d53%2110000002371991125%21sea&curPageLogUid=yDxMTsGMA1m1
https://he.aliexpress.com/item/1005002548126691.html?algo_pvid=cf9a5cc9-df53-404b-a8ed-db8a227069e6&algo_exp_id=cf9a5cc9-df53-404b-a8ed-db8a227069e6-15&pdp_ext_f=%7B%22sku_id%22%3A%2212000021089170892%22%7D&pdp_npi=2%40dis%21ILS%2171.49%2118.2%21%21%21%21%21%400b0a0ac216625452616837499e4d53%2112000021089170892%21sea&curPageLogUid=xfzmTqNWeUEV
https://he.aliexpress.com/item/4000735115535.html?algo_pvid=cf9a5cc9-df53-404b-a8ed-db8a227069e6&algo_exp_id=cf9a5cc9-df53-404b-a8ed-db8a227069e6-17&pdp_ext_f=%7B%22sku_id%22%3A%2212000027051383811%22%7D&pdp_npi=2%40dis%21ILS%21223.41%21176.51%21%21%2128.2%21%21%400b0a0ac216625452616837499e4d53%2112000027051383811%21sea&curPageLogUid=FasdXFlpMCuE
https://he.aliexpress.com/item/1005002659060326.html?algo_pvid=cf9a5cc9-df53-404b-a8ed-db8a227069e6&algo_exp_id=cf9a5cc9-df53-404b-a8ed-db8a227069e6-20&pdp_ext_f=%7B%22sku_id%22%3A%2212000021594037009%22%7D&pdp_npi=2%40dis%21ILS%211144.03%21800.83%21%21%21%21%21%400b0a0ac216625452616837499e4d53%2112000021594037009%21sea&curPageLogUid=qWxA5NkkHX7u
https://he.aliexpress.com/item/1005003050882550.html?algo_pvid=cf9a5cc9-df53-404b-a8ed-db8a227069e6&algo_exp_id=cf9a5cc9-df53-404b-a8ed-db8a227069e6-21&pdp_ext_f=%7B%22sku_id%22%3A%2212000023509149328%22%7D&pdp_npi=2%40dis%21ILS%2178.01%2135.5%21%21%21%21%21%400b0a0ac216625452616837499e4d53%2112000023509149328%21sea&curPageLogUid=eazJyQ2xYlTa
https://he.aliexpress.com/item/1005001429138105.html?algo_pvid=cf9a5cc9-df53-404b-a8ed-db8a227069e6&algo_exp_id=cf9a5cc9-df53-404b-a8ed-db8a227069e6-5&pdp_ext_f=%7B%22sku_id%22%3A%2212000016106374836%22%7D&pdp_npi=2%40dis%21ILS%21908.29%21272.49%21%21%21%21%21%400b0a0ac216625452616837499e4d53%2112000016106374836%21sea&curPageLogUid=UTqfAcwKjxt3
https://he.aliexpress.com/item/1005003575980819.html?algo_pvid=cf9a5cc9-df53-404b-a8ed-db8a227069e6&algo_exp_id=cf9a5cc9-df53-404b-a8ed-db8a227069e6-9&pdp_ext_f=%7B%22sku_id%22%3A%2212000026346773711%22%7D&pdp_npi=2%40dis%21ILS%2156.48%2137.29%21%21%21%21%21%400b0a0ac216625452616837499e4d53%2112000026346773711%21sea&curPageLogUid=t6N9K9ie8MII
https://he.aliexpress.com/item/1005004406234563.html?algo_pvid=cf9a5cc9-df53-404b-a8ed-db8a227069e6&algo_exp_id=cf9a5cc9-df53-404b-a8ed-db8a227069e6-24&pdp_ext_f=%7B%22sku_id%22%3A%2212000029068345270%22%7D&pdp_npi=2%40dis%21ILS%21334.62%21167.31%21%21%2119.93%21%21%400b0a0ac216625452616837499e4d53%2112000029068345270%21sea&curPageLogUid=ERz9XojcigBs
https://he.aliexpress.com/item/1005003208283080.html?algo_pvid=cf9a5cc9-df53-404b-a8ed-db8a227069e6&algo_exp_id=cf9a5cc9-df53-404b-a8ed-db8a227069e6-18&pdp_ext_f=%7B%22sku_id%22%3A%2212000024667248681%22%7D&pdp_npi=2%40dis%21ILS%21164.38%21100.25%21%21%21%21%21%400b0a0ac216625452616837499e4d53%2112000024667248681%21sea&curPageLogUid=Z5qyCqcRmY48
https://he.aliexpress.com/item/1005002983524272.html?algo_pvid=cf9a5cc9-df53-404b-a8ed-db8a227069e6&algo_exp_id=cf9a5cc9-df53-404b-a8ed-db8a227069e6-37&pdp_ext_f=%7B%22sku_id%22%3A%2212000023060725911%22%7D&pdp_npi=2%40dis%21ILS%21164.45%21146.35%21%21%21%21%21%400b0a0ac216625452616837499e4d53%2112000023060725911%21sea&curPageLogUid=LPz7kSEHNr3P
https://he.aliexpress.com/item/32964505971.html?algo_pvid=cf9a5cc9-df53-404b-a8ed-db8a227069e6&algo_exp_id=cf9a5cc9-df53-404b-a8ed-db8a227069e6-22&pdp_ext_f=%7B%22sku_id%22%3A%2212000020850078768%22%7D&pdp_npi=2%40dis%21ILS%21861.0%21266.91%21%21%21%21%21%400b0a0ac216625452616837499e4d53%2112000020850078768%21sea&curPageLogUid=ehbKLbbMGsal
https://he.aliexpress.com/item/1005004243027957.html?algo_pvid=cf9a5cc9-df53-404b-a8ed-db8a227069e6&algo_exp_id=cf9a5cc9-df53-404b-a8ed-db8a227069e6-36&pdp_ext_f=%7B%22sku_id%22%3A%2212000029030797848%22%7D&pdp_npi=2%40dis%21ILS%21723.76%21325.69%21%21%21%21%21%400b0a0ac216625452616837499e4d53%2112000029030797848%21sea&curPageLogUid=SAk2CRsHhKDu
Process finished with exit code 0
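The range(5) loop and the 600-pixel scroll step above are arbitrary; if you want to be sure the whole list of 60 products has loaded, one variation (a sketch under the same assumptions about the page) is to keep scrolling until a pass adds no new URLs:
urls = set()
while True:
    before = len(urls)
    urls.update(a.get_attribute('href') for a in driver.find_elements(By.CSS_SELECTOR, 'a[href*="/item/"]'))
    driver.execute_script("window.scrollBy(0, 600);")
    time.sleep(2)
    if len(urls) == before:  # nothing new appeared after this scroll pass, so stop
        break
for url in urls:
    print(url)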

How to scrape the ratings and all the reviews from the website using selenium

I want to scrape the rating and all the reviews on the page, but I am not able to find the path.
import urllib.request
from bs4 import BeautifulSoup
import csv
import os
from selenium import webdriver
from selenium.webdriver.support.select import Select
from selenium.webdriver.common.keys import Keys
import pandas as pd
import time
chrome_path =r'C:/Users/91940/AppData/Local/Programs/Python/Python39/Scripts/chromedriver.exe'
driver = webdriver.Chrome(executable_path=chrome_path)
driver.implicitly_wait(10)
driver.get("https://www.lazada.sg/products/samsung-galaxy-watch3-bt-45mm-titanium-i1156462257-
s4537770883.html?search=1&freeshipping=1")
product_name = driver.find_element_by_xpath('//*[@id="module_product_title_1"]/div/div/h1')
print(product_name.text)
rating = driver.find_element_by_xpath("//span[@class='score-average']")
print(rating.text)
review = driver.find_element_by_xpath('//*[@id="module_product_review"]/div/div/div[3]/div[1]/div[1]')
print(review.text)
I believe print(product_name.text) is executing correctly, right?
There is an issue with driver.find_element_by_xpath("//span[@class='score-average']"): I could not find score-average anywhere in the HTML source.
So try this instead:
rate = driver.find_element_by_css_selector("div.pdp-review-summary")
print(rate.text)
You can try the below code to get review :
wait = WebDriverWait(driver, 10)
driver.get("https://www.lazada.sg/products/samsung-galaxy-watch3-bt-45mm-titanium-i1156462257- s4537770883.html?search=1&freeshipping=1")
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "a[class$='pdp-review-summary__link']"))).click()
ActionChains(driver).move_to_element(wait.until(EC.visibility_of_element_located((By.XPATH, "//h2[contains(text(), 'Ratings & Reviews')]")))).perform()
wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "div.item-content")))
for review in driver.find_elements(By.CSS_SELECTOR, "div.item-content"):
    print(review.get_attribute('innerHTML'))
Imports:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
Perhaps there is a problem with your path? (Apologies, I'm not on Windows to test.) From memory, Windows paths use \ characters instead of /. Additionally, you may need two backslashes after the drive letter (C:\\).
c:\\Users\91940\AppData\Local\...
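For what it's worth, forward slashes generally work in Python on Windows as well; the safer habit is simply to avoid un-escaped backslashes, either by doubling them or by using a raw string (the path below is the one from the question):
chrome_path = 'C:\\Users\\91940\\AppData\\Local\\Programs\\Python\\Python39\\Scripts\\chromedriver.exe'
chrome_path = r'C:\Users\91940\AppData\Local\Programs\Python\Python39\Scripts\chromedriver.exe'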

Hidden phone number can't be scraped

I've been having trouble trying to extract the phone number after clicking the "llamar" button. So far I've used the XPath method with Selenium and also tried using BeautifulSoup to extract the number, but unfortunately nothing has worked. I usually get an invalid selector error (if I use an XPath selector with Selenium) and with BS4 I get AttributeError: 'NoneType' object has no attribute 'text' ...
I hope you can help me out!
Here is the url to the link - https://www.milanuncios.com/venta-de-pisos-en-malaga-malaga/portada-alta-carlos-de-haya-carranque-386352344.htm
Here's the code that I tried:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import pandas as pd
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import UnexpectedAlertPresentException
url = 'https://www.milanuncios.com/venta-de-pisos-en-malaga-malaga/portada-alta-carlos-de-haya-carranque-386352344.htm'
path = r'C:\Users\WL-133\anaconda3\Lib\site-packages\selenium\webdriver\chrome\chromedriver.exe'
path1 = r'C:\Users\WL-133\anaconda3\Lib\site-packages\selenium\webdriver\firefox'
# driver = webdriver.Chrome(path)
options = Options()
driver = webdriver.Chrome(path)
driver.get(url)
a = []
mah_div = driver.page_source
soup = BeautifulSoup(mah_div, features='lxml')
cookie_button = '//*[@id="sui-TcfFirstLayerModal"]/div/div/footer/div/button[2]'
btn_press = driver.find_element_by_xpath(cookie_button)
btn_press.click()
llam_button = '//*[@id="ad-detail-contact"]/a[2]'
llam_press = driver.find_element_by_xpath(llam_button)
llam_press.click()
time.sleep(10)
for item in soup.find_all("div", {"class": "contenido"}):
    a.append(item.find("div", {"class": "plaincontenido"}).text)
print(a)
The phone number is stored inside JavaScript. You can use the re module to extract it:
import re
import requests
from bs4 import BeautifulSoup
url = "https://www.milanuncios.com/venta-de-pisos-en-malaga-malaga/portada-alta-carlos-de-haya-carranque-386352344.htm"
phone_url = "https://www.milanuncios.com/datos-contacto/?usePhoneProxy=0&from=detail&includeEmail=false&id={}"
ad_id = re.search(r"(\d+)\.htm", url).group(1)
html_text = requests.get(phone_url.format(ad_id)).text
soup = BeautifulSoup(html_text, "html.parser")
phone = re.search(r"getTrackingPhone\((.*?)\)", html_text).group(1)
print(soup.select_one(".texto").get_text(strip=True), phone)
Prints:
ana (Particular) 639....
With Selenium you will need to click the button and switch to the iframe.
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
wait = WebDriverWait(driver, 10)  # the wait object was missing; a 10-second timeout is assumed
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, ".def-btn.phone-btn")))
tel_button = driver.find_element_by_css_selector(".def-btn.phone-btn")
tel_button.click()
wait.until(EC.frame_to_be_available_and_switch_to_it((By.ID, "ifrw")))
wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR,".texto>.telefonos")))
tel_number = driver.find_element_by_css_selector(".texto>.telefonos").text
Please note, I used more stable locators.
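One small addition worth mentioning: after reading the number from inside the iframe, switch back to the main document before interacting with anything else on the page (standard Selenium behaviour, not specific to this site):
print(tel_number)
driver.switch_to.default_content()  # leave the iframe once the number has been read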

Scraping a dynamically loaded table with BeautifulSoup

My code returns the values of the first two tags, but the tags after that return nothing.
HTML:
My code:
import bs4 as bs
import requests
resp = requests.get('https://q.stock.sohu.com/cn/bk_4401.shtml')
resp.encoding = 'gb2312'
soup = bs.BeautifulSoup(resp.text, 'lxml')
tab_sgtsc_list = soup.find('table').find('tbody').find_all('tr')
for tab_sgtsc in tab_sgtsc_list:
    print('**************************************')
    print(tab_sgtsc.find_all('td')[0].text)
    print(tab_sgtsc.find_all('td')[1].text)
    print(tab_sgtsc.find_all('td')[2].text)
    print(tab_sgtsc.find_all('td')[3].text)
    print('**************************************')
Result:
The table is rendered dynamically by JavaScript, so you won't get much from the raw HTML.
However, selenium and pandas come to the rescue!
Required:
Chrome driver
selenium
pandas (pip install pandas)
Here's how:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
options = Options()
options.headless = True
driver = webdriver.Chrome(options=options)
driver.get("https://q.stock.sohu.com/cn/bk_4401.shtml")
wait = WebDriverWait(driver, 10)
element = wait.until(
EC.visibility_of_element_located((By.CSS_SELECTOR, 'table.tableMSB'))
).text.replace("点击按代码排序查询", "").split()
table = [element[i:i + 12] for i in range(0, len(element), 12)]
pd.DataFrame(table[1:], columns=table[0]).to_csv("your_table_data.csv", index=False)
Output: the table is written to your_table_data.csv.
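As a variation on the same idea, once the wait confirms the table is visible you could hand the rendered table straight to pandas.read_html instead of splitting the element text; this is a sketch under the assumption that table.tableMSB is the table you want (the 点击按代码排序查询 header cell may still need to be dropped from the resulting frame):
table_el = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'table.tableMSB')))
df = pd.read_html(table_el.get_attribute('outerHTML'))[0]  # parse just the rendered table
df.to_csv("your_table_data.csv", index=False)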
