I am trying to download pictures from a clothing website for academic research, using the code below:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
import time
import os
import wget
import random
import requests
from fake_useragent import UserAgent

delay_choices = range(5, 15)
delay = random.choice(delay_choices)

keyword = "jeans"
user_agent = UserAgent()
response = requests.get(url="https://www2.hm.com/en_asia3/ladies/shop-by-product/jeans.html", headers={'user-agent': user_agent.random})
driver = webdriver.Chrome("~~~~")
driver.get("https://www2.hm.com/en_asia3/ladies/shop-by-product/jeans.html")
time.sleep(4)
cookie = driver.find_element(By.ID, 'onetrust-accept-btn-handler')
cookie.click()
time.sleep(2)
for i in range(6):
    driver.execute_script("window.scrollTo(0, 6900);")
    time.sleep(delay)
    loadmore = driver.find_element(By.XPATH, "/html/body/main/div/div/div/div[3]/div[2]/button")
    loadmore.click()
imgs = driver.find_elements(By.CLASS_NAME, 'item-image')
path = "H&M" + keyword
os.makedirs(path, exist_ok=True)  # avoid failing if the folder already exists
count = 0
for img in imgs:
    save_as = os.path.join(path, keyword + str(count) + '.jpg')
    # print(img.get_attribute("src"))
    wget.download(img.get_attribute("src"), save_as)
    count += 1
    time.sleep(6)
driver.quit()
and I got this issue on this line:
wget.download(img.get_attribute('src'), save_as)
I have used "src" the same way to download from other websites and it worked fine there. I wonder if anyone knows what is happening. 😢
Thanks a lot.
I have searched and couldn't solve this problem, and I hope someone can give me some advice.
The URL you are trying to download using wget contains a lot of special characters, and these can cause problems for wget. This is an example of the URL you are attempting to download from: https://lp2.hm.com/hmgoepprod?set=source[/2b/bf/2bbf11a29fde773adcdOK],res[y],hmver[1]&call=url[file:/product/main]
Try to change the command a bit:
Instead of this:
wget.download(img.get_attribute("src"), save_as)
try this:
wget.download(f'"{img.get_attribute("src")}"', save_as)
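If quoting the URL doesn't help, a minimal sketch of an alternative (using the requests library and the user_agent object already present in the question's code) is to fetch the image bytes directly and skip wget entirely:

src = img.get_attribute("src")
# Stream the image with requests instead of wget; the special characters in the
# URL are handled by requests without any extra quoting.
r = requests.get(src, headers={'user-agent': user_agent.random}, timeout=30)
r.raise_for_status()  # fail loudly instead of saving an error page as a .jpg
with open(save_as, 'wb') as f:
    f.write(r.content)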
I'm trying to collect all five of the social media links from the artist in this example. Currently, my output is only the LAST (fifth) social media link. I'm using Selenium; I understand this may not be the best option for collecting this data, but it's all I know at this time.
Note: I've only included the relevant code for my question. Thank you in advance for any help/insight.
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
import time
from random import randint
import pandas as pd

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('disable-infobars')
chrome_options.add_argument('--disable-extensions')
chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
driver = webdriver.Chrome(chrome_options=chrome_options)

for url in urls:
    driver.get("https://soundcloud.com/flux-pavilion")
    time.sleep(randint(3, 4))
    try:
        links = driver.find_elements_by_xpath('//*[@id="content"]/div/div[4]/div[2]/div/article[1]/div[2]/ul/li//a[@href]')
        for elem in links:
            socialmedia = (elem.get_attribute("href"))
    except:
        links = "none"

    artist = {
        'socialmedia': socialmedia,
    }
    print(artist)
print(artist)
The problem is not with your XPath expression, but rather with the (non-existent) list processing of your output code.
Because socialmedia is overwritten on every pass through the loop, your code outputs only the last item of the resulting XPath list. That is why you only received one link (it was the last one).
So change the output part of your code to
[...]
driver.get("https://soundcloud.com/flux-pavilion")
time.sleep(randint(3, 4))
artist = []
try:
    links = driver.find_elements_by_xpath('//*[@id="content"]/div/div[4]/div[2]/div/article[1]/div[2]/ul/li//a[@href]')
    for elem in links:
        artist.append(elem.get_attribute("href"))
except:
    links = "none"

for link in artist:
    print(link)
And the output will contain all of the values (links) you desire:
https://gate.sc/?url=https%3A%2F%2Ftwitter.com%2FFluxpavilion&token=da4a8d-1-1653430570528
https://gate.sc/?url=https%3A%2F%2Finstagram.com%2FFluxpavilion&token=277ea0-1-1653430570529
https://gate.sc/?url=https%3A%2F%2Ffacebook.com%2FFluxpavilion&token=4c773c-1-1653430570530
https://gate.sc/?url=https%3A%2F%2Fyoutube.com%2FFluxpavilion&token=1353f7-1-1653430570531
https://gate.sc/?url=https%3A%2F%2Fopen.spotify.com%2Fartist%2F7muzHifhMdnfN1xncRLOqk%3Fsi%3DbK9XeoW5RxyMlA-W9uVwPw&token=bc2936-1-1653430570532
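As a side note, these hrefs are wrapped by SoundCloud's outbound link gate (gate.sc). If you want the underlying profile URLs, a small sketch using only the standard library could unwrap them:

from urllib.parse import urlparse, parse_qs

def unwrap_gate(href):
    # gate.sc carries the real target in its ?url=... query parameter;
    # parse_qs decodes the percent-encoding for us
    qs = parse_qs(urlparse(href).query)
    return qs['url'][0] if 'url' in qs else href

print(unwrap_gate('https://gate.sc/?url=https%3A%2F%2Ftwitter.com%2FFluxpavilion&token=da4a8d-1-1653430570528'))
# -> https://twitter.com/Fluxpavilion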
I hope everyone is having a good day. I am trying to extract values from a website and have them print out as a list, but I can't figure out how to do that. I have all the values printing as expected; I just can't figure out how to have them print one after another. I know this is a very basic question, but I can't figure it out. Any advice or information is appreciated. Thank you!
import time
import chromedriver_autoinstaller
from selenium import webdriver as wd
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait

webdriver = wd.Chrome(executable_path=r"C:\Users\Stephanie\anaconda3\pkgs\python-chromedriver-binary-98.0.4758.48.0-py39hcbf5309_0\Lib\site-packages\chromedriver_binary\chromedriver.exe")
webdriver.implicitly_wait(1)
webdriver.maximize_window()
webdriver.get("https://pcpartpicker.com/user/stephwaters/saved/#view=HgH2xr")
time.sleep(2)

partname = webdriver.find_elements(By.CLASS_NAME, 'td__component')
for part in partname:
    print(part.text + ': ')

prices = webdriver.find_elements(By.CLASS_NAME, 'td__price')
for price in prices:
    print(price.text)
This is the output: all of the part names first, then all of the prices. I would like it to print:
Case: $168.99
Power Supply: $182.00
and so on.
Instead of getting the part names and prices separately, you can iterate over the list of products, extracting from each one its name and price.
Also, it's recommended to use Expected Conditions explicit waits rather than hardcoded pauses.
Your code could be something like this:
import time
import chromedriver_autoinstaller
from selenium import webdriver as wd
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

webdriver = wd.Chrome(executable_path=r"C:\Users\Stephanie\anaconda3\pkgs\python-chromedriver-binary-98.0.4758.48.0-py39hcbf5309_0\Lib\site-packages\chromedriver_binary\chromedriver.exe")
wait = WebDriverWait(webdriver, 20)

webdriver.maximize_window()
webdriver.get("https://pcpartpicker.com/user/stephwaters/saved/#view=HgH2xr")
wait.until(EC.visibility_of_element_located((By.XPATH, "//tr[@class='tr__product']")))
time.sleep(0.3)  # short delay to make sure more than just the first product has loaded

products = webdriver.find_elements(By.XPATH, '//tr[@class="tr__product"]')
for product in products:
    name = product.find_element(By.XPATH, './/td[@class="td__component"]')
    price = product.find_element(By.XPATH, './/td[@class="td__price"]//a')
    print(name.text + ': ' + price.text)
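If you'd rather keep the two separate find_elements calls from the question, a minimal alternative (assuming both lists come back in the same row order, which is not guaranteed if any row lacks a price) is to pair them with zip:

# pair each part name with the price from the same position
for part, price in zip(partname, prices):
    print(part.text + ': ' + price.text)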
I'm using Selenium to do headless scraping of a website within an endpoint of a Flask API in Python. I ran several tests, and my Selenium scraping code works perfectly within a script and while running as an API on localhost. However, when I deploy the code on a remote server, the requests always return a 502 Bad Gateway error. It is weird, because from the logs I can see that the scraping is working correctly, but the server responds with 502 before the scraping finishes processing, as if it were trying to set up a proxy and failing. I also noticed that removing the time.sleep in my code makes it return a 200, although the result could be wrong because it doesn't give Selenium the proper time to load the whole page to scrape.
I also tried using Falcon instead of Flask and I get a similar error. This is a sample of my recent code using Falcon:
class GetUrl(object):
    def on_get(self, req, resp):
        """
        Get Request
        :param req:
        :param resp:
        :return:
        """
        # read parameter
        req_body = req.bounded_stream.read()
        json_data = json.loads(req_body.decode('utf8'))
        url = json_data.get("url")

        # get the url
        options = Options()
        options.add_argument("--headless")
        driver = webdriver.Firefox(firefox_options=options)
        driver.get(url)
        time.sleep(5)
        result = False

        # check for outbound links
        content = driver.find_elements_by_xpath("//a[@class='_52c6']")
        if len(content) > 0:
            href = content[0].get_attribute("href")
            result = True
        driver.quit()

        # make the return
        return_doc = {"result": result}
        resp.body = json.dumps(return_doc, sort_keys=True, indent=2)
        resp.content_type = 'text/string'
        resp.append_header('Access-Control-Allow-Origin', "*")
        resp.status = falcon.HTTP_200
I saw some other similar issues like this, but even though I can see that there is a gunicorn process running on my server, I don't have nginx, or at least it is not running where it should be. And I don't think Falcon uses it. So, what exactly am I doing wrong? Some light on this issue is highly appreciated, thank you!
This might work:
from IPython.display import clear_output
import time as time
import json
!apt-get update
!apt install chromium-chromedriver
!which chromedriver
!pip install selenium
import selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.expected_conditions import presence_of_element_located
!pip install page_objects
import page_objects
from page_objects import PageObject, PageElement
time.sleep(1)
clear_output()
class GetUrl(object):
    def on_get(self, req, resp):
        """
        Get Request
        :param req:
        :param resp:
        :return:
        """
        # read parameter
        req_body = req.bounded_stream.read()
        json_data = json.loads(req_body.decode('utf8'))
        url = json_data.get("url")

        # get the url (hardcoded below for testing)
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')
        driver = webdriver.Chrome('chromedriver', options=options)
        driver.implicitly_wait(3)
        driver.get("https://stackoverflow.com/questions/69038958/selenium-flask-falcon-in-python-502-bad-gateway-error/69546175#69546175")
        result = False

        # check for outbound links
        contentStorage = []
        content = driver.find_elements_by_tag_name('a')
        for i in content:
            contentStorage.append(i.get_attribute('text'))
            result = True
        # driver.quit()

        # make the return
        return_doc = {"result": result}
        resp.body = json.dumps(return_doc, sort_keys=True, indent=2)
        resp.content_type = 'text/string'
        resp.append_header('Access-Control-Allow-Origin', "*")
        resp.status = falcon.HTTP_200
However, I was testing it without using a class object, and with Chrome instead of Firefox:
from IPython.display import clear_output
import time as time
!apt-get update
!apt install chromium-chromedriver
!which chromedriver
!pip install selenium
import selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.expected_conditions import presence_of_element_located
!pip install page_objects
import page_objects
from page_objects import PageObject, PageElement
time.sleep(1)
clear_output()
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
driver = webdriver.Chrome('chromedriver',options = options)
driver.implicitly_wait(3)
driver.get('https://stackoverflow.com/questions/69038958/selenium-flask-falcon-in-python-502-bad-gateway-error/69546175#69546175')
content = driver.find_elements_by_tag_name('a')
contentStorage = []
for i in content:
    contentStorage.append(i.get_attribute('text'))
#driver.quit()
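A side note on the 502 itself: the fact that removing time.sleep makes the request return 200 is consistent with the WSGI worker being killed for exceeding its request timeout (gunicorn's default is 30 seconds). This is only a guess from the symptoms, but if gunicorn is serving the app, raising the timeout is worth trying (app:app below is a placeholder for the actual entry point):

# allow slow scraping requests up to 120 s before the worker is killed
gunicorn --timeout 120 app:app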
I want to scrape the rating and all the reviews on the page, but I am not able to find the path.
import urllib.request
from bs4 import BeautifulSoup
import csv
import os
from selenium import webdriver
from selenium.webdriver.support.select import Select
from selenium.webdriver.common.keys import Keys
import pandas as pd
import time
chrome_path = r'C:/Users/91940/AppData/Local/Programs/Python/Python39/Scripts/chromedriver.exe'
driver = webdriver.Chrome(executable_path=chrome_path)
driver.implicitly_wait(10)
driver.get("https://www.lazada.sg/products/samsung-galaxy-watch3-bt-45mm-titanium-i1156462257-s4537770883.html?search=1&freeshipping=1")

product_name = driver.find_element_by_xpath('//*[@id="module_product_title_1"]/div/div/h1')
print(product_name.text)

rating = driver.find_element_by_xpath("//span[@class='score-average']")
print(rating.text)

review = driver.find_element_by_xpath('//*[@id="module_product_review"]/div/div/div[3]/div[1]/div[1]')
print(review.text)
I believe print(product_name.text) is getting executed correctly, right?
There is an issue with driver.find_element_by_xpath("//span[@class='score-average']"): I could not find score-average anywhere in the HTML source.
So try this instead:
rate = driver.find_element_by_css_selector("div.pdp-review-summary")
print(rate.text)
You can try the below code to get the reviews:
wait = WebDriverWait(driver, 10)
driver.get("https://www.lazada.sg/products/samsung-galaxy-watch3-bt-45mm-titanium-i1156462257-s4537770883.html?search=1&freeshipping=1")
wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "a[class$='pdp-review-summary__link']"))).click()
ActionChains(driver).move_to_element(wait.until(EC.visibility_of_element_located((By.XPATH, "//h2[contains(text(), 'Ratings & Reviews')]")))).perform()
wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "div.item-content")))
for review in driver.find_elements(By.CSS_SELECTOR, "div.item-content"):
    print(review.get_attribute('innerHTML'))
Imports:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
Perhaps there is a problem with your path? (Apologies, I'm not on Windows to test.) From memory, Windows paths use \ characters instead of /. Additionally, you may need two backslashes after the drive letter (C:\\).
c:\\Users\91940\AppData\Local\...
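For illustration, the usual safe ways to spell a Windows path in Python (the directory below is just an example, not the asker's real path):

# raw string: backslashes are not treated as escape sequences
chrome_path = r'C:\Users\91940\AppData\Local\chromedriver.exe'
# doubled backslashes work as well
chrome_path = 'C:\\Users\\91940\\AppData\\Local\\chromedriver.exe'
# forward slashes are also accepted by Python's file APIs on Windows
chrome_path = 'C:/Users/91940/AppData/Local/chromedriver.exe'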
I am trying to get the download link and download the files.
I have a log file which contains the following links:
http://www.downloadcrew.com/article/18631-aida64
http://www.downloadcrew.com/article/4475-sumo
http://www.downloadcrew.com/article/2174-iolo_system_mechanic_professional
...
...
I have code like this:
import urllib, time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

driver = webdriver.Firefox()  # browser used to click through to the download
f = open("dcrewtest.txt")
for line in f.readlines():
    url = line.strip()
    driver.get(url)  # load the article page before clicking anything
    try:
        driver.find_element_by_xpath("//div/div[2]/div[2]/div[2]/div[3]/div/a/img").click()
        time.sleep(8)
    except:
        pass
    pageurl = urllib.urlopen(url).read()
    soup = BeautifulSoup(pageurl)
    for a in soup.select("h1#articleTitle"):
        print a.contents[0].strip()
    for b in soup.findAll("th"):
        if b.text == "Date Updated:":
            print b.parent.td.text
        elif b.text == "Developer:":
            print b.parent.td.text
Up to this point I do not know how to get the download link and download it.
Is it possible to download the file using selenium?
According to the documentation, you should configure the FirefoxProfile to automatically download files with a specified content type. Here's an example using the first URL in your txt file that saves the exe file to the current directory:
import os
from selenium import webdriver
fp = webdriver.FirefoxProfile()
fp.set_preference("browser.download.folderList", 2)
fp.set_preference("browser.download.manager.showWhenStarting", False)
fp.set_preference("browser.download.dir", os.getcwd())
fp.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/x-msdos-program")

driver = webdriver.Firefox(firefox_profile=fp)
driver.get("http://www.downloadcrew.com/article/18631-aida64")
driver.find_element_by_xpath("//div[@class='downloadLink']/a/img").click()
Note that I've also simplified the XPath.
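If Firefox still shows a download dialog, the server is probably reporting a different MIME type than application/x-msdos-program. A quick way to check it (a sketch assuming the requests library; file_url is a placeholder for the direct download link, not a URL from the question):

import requests

file_url = "http://example.com/path/to/file.exe"  # placeholder: the direct file link
r = requests.head(file_url, allow_redirects=True)
# whatever this prints is the value neverAsk.saveToDisk has to match
print(r.headers.get("Content-Type"))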