I would like to save all the images from a Pinterest board. I'm having trouble writing the part that returns to the board and moves on to the next image after downloading one, and I would appreciate your help.
Board example: https://www.pinterest.jp/aku_ma/%E3%82%A2%E3%83%8B%E3%83%A1%E3%82%A2%E3%82%A4%E3%82%B3%E3%83%B3/
1. Log in
2. Access the board ← I have done this.
3. Access the page of an image in the board
4. Press the download button and save to the specified path
5. Return to the board and access the page for the next image
Code up to the point of accessing the board:
import time

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

url = 'https://www.pinterest.jp/aku_ma/%E3%82%A2%E3%83%8B%E3%83%A1%E3%82%A2%E3%82%A4%E3%82%B3%E3%83%B3/'
emailAddress = 'xxxx@gmail.com'
passwordNumber = 'xxxx'
foldername = "/Users/t/Desktop/koreanLikeImages"  # where the images should be saved

options = Options()
# options.add_argument('--headless')
driver = webdriver.Chrome(options=options)
driver.get(url)

# Push the login button
loginButton = driver.find_element(By.CSS_SELECTOR, "div[data-test-id='login-button']")
loginButton.click()
time.sleep(1)

# Enter ID and password
email = driver.find_element(By.ID, "email")
email.send_keys(emailAddress)
password = driver.find_element(By.ID, "password")
password.send_keys(passwordNumber)

# Push the red login button
redLoginButton = driver.find_element(By.CLASS_NAME, "SignupButton")
redLoginButton.click()
time.sleep(3)
driver.get(url)
Steps 3, 4 and 5 are not necessary, because when you are on the board page the high-resolution links are already loaded in the HTML. For example, this is the HTML of one image:
<img ... srcset="
https://i.pinimg.com/236x/80/c8/ec/80c8ec56386197561bac4c4e40d331b8.jpg 1x,
https://i.pinimg.com/474x/80/c8/ec/80c8ec56386197561bac4c4e40d331b8.jpg 2x,
https://i.pinimg.com/736x/80/c8/ec/80c8ec56386197561bac4c4e40d331b8.jpg 3x,
https://i.pinimg.com/originals/80/c8/ec/80c8ec56386197561bac4c4e40d331b8.jpg 4x">
As you can see, each image has four URLs, one per resolution, and the 4x one has the highest resolution. Using urllib.request.urlretrieve(url, filename) we can download the file at url, so we can save the images in high quality directly from the board page.
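To make the parsing used below concrete, here is that extraction applied to the example srcset above (a standalone sketch):
srcset = (
    "https://i.pinimg.com/236x/80/c8/ec/80c8ec56386197561bac4c4e40d331b8.jpg 1x, "
    "https://i.pinimg.com/474x/80/c8/ec/80c8ec56386197561bac4c4e40d331b8.jpg 2x, "
    "https://i.pinimg.com/736x/80/c8/ec/80c8ec56386197561bac4c4e40d331b8.jpg 3x, "
    "https://i.pinimg.com/originals/80/c8/ec/80c8ec56386197561bac4c4e40d331b8.jpg 4x"
)
# The last comma-separated candidate is the 4x entry; its first token is the URL.
url = srcset.split(',')[-1].split()[0]
print(url)  # https://i.pinimg.com/originals/80/c8/ec/80c8ec56386197561bac4c4e40d331b8.jpg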
import urllib.request
from selenium.common.exceptions import StaleElementReferenceException

foldername = 'C://Users//gtu//Desktop//folder//'
urls = []
new_images = False
while True:
    images = driver.find_elements(By.CSS_SELECTOR, 'img[srcset]')
    for img in images:
        try:
            url = img.get_attribute('srcset').split(',')[-1].split()[0]  # [-1] selects the largest resolution
        except StaleElementReferenceException:
            # As you scroll down, old images are removed from the HTML,
            # so this error may be raised, but it is not a real problem.
            continue
        if url not in urls:
            # Scroll down so that new images are loaded
            driver.execute_script('arguments[0].scrollIntoView({block: "center", behavior: "smooth"});', img)
            urls.append(url)
            print(url)
            new_images = True
            file_name = url.split('/')[-1]
            # Download the image
            urllib.request.urlretrieve(url, foldername + file_name)
            time.sleep(1)
    # If there are no new images, it means we reached the bottom of the page
    if not new_images:
        break
    else:
        new_images = False
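One practical caveat: urlretrieve raises an error if the target folder does not exist, and some image hosts reject urllib's default User-Agent. A small hardening sketch (the User-Agent workaround is an assumption; plain urllib may well work against i.pinimg.com):
import os
import urllib.request

os.makedirs(foldername, exist_ok=True)  # create the download folder if it is missing

# Optional: present a browser-like User-Agent for hosts that reject urllib's default.
opener = urllib.request.build_opener()
opener.addheaders = [('User-Agent', 'Mozilla/5.0')]
urllib.request.install_opener(opener)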
Related
I am trying to screenshot an image located inside an iframe in an ad creative, in headless mode.
I will eventually have to screenshot many such iframes, and the final script will run on a remote server.
No matter what I try, the screenshots always come out cropped when I use Selenium's headless mode.
I have seen a few posts on this subject, but none of them solved my issue.
Here is a list of things I already tried:
Using either Firefox or Chrome webdrivers didn't help.
Using different combinations of waits conditions didn't help either.
Below is an MWE of the code I am trying to run:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.firefox.options import Options as OptionsFirefox
from selenium.webdriver.chrome.options import Options as OptionsChrome
from PIL import Image
from io import BytesIO
from icecream import ic  # debug printing used below (pip install icecream)
test_url = "https://cdn-creatives.adikteev.com/Creatives/demoLink/MLEngine/index.html?MRAID_320X480_AWEM_CradleEmpires_Aug20/creative-e03f09e5.min.js"
id_iframe = "mainIframe"
# Setting up the driver.
# options = OptionsFirefox()
# options.headless = True
# driver = webdriver.Firefox(options=options)
options = OptionsChrome()
options.headless = True
driver = webdriver.Chrome(options=options)
# Getting the url.
driver.get(test_url)
WebDriverWait(driver, 20).until(EC.visibility_of_all_elements_located((By.ID, id_iframe)))
# Getting the iframe and its geometry:
elem = driver.find_element(By.ID, id_iframe)
location = elem.location
size = elem.size
left = location['x']
top = location['y']
right = location['x'] + size['width']
bottom = location['y'] + size['height']
ic(elem.location)
ic(elem.size)
# Waits (might help ?).
WebDriverWait(driver, 20).until(EC.frame_to_be_available_and_switch_to_it(id_iframe))
# Saving screenshots:
# Complete screenshot.
img_png = driver.get_screenshot_as_png()
driver.save_screenshot("full_screen_headless_on.png")
img_crop = Image.open(BytesIO(img_png))
img_crop = img_crop.crop((left, top, right, bottom)) # defines crop points
# Screenshot cropped to the Iframe.
img_crop.save( "iframe_screen_headless_on.png" ) # saves new cropped image
driver.quit()
If someone has a solution, that will be greatly appreciated :-) !
I had the same issue with Selenium.
In my case, additional waiting after loading the URL helped, for instance:
...
driver.get(url)
time.sleep(10)
WebDriverWait(driver, 20).until(
    EC.frame_to_be_available_and_switch_to_it((By.ID, id_iframe))
)
...
I can't really explain why it works like that, as I didn't dig into the docs, but it helped.
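For what it's worth, cropped headless screenshots are very often caused by headless Chrome's small default window (800x600); pinning the window size is another fix worth trying. A minimal sketch, reusing the question's Chrome setup:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options as OptionsChrome

options = OptionsChrome()
options.headless = True
options.add_argument("--window-size=1920,1080")  # avoid the tiny headless default
options.add_argument("--hide-scrollbars")        # keep scrollbars out of the capture
driver = webdriver.Chrome(options=options)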
I'm trying to scrape my Deezer music, but when I scroll the site Selenium skips a lot of tracks: it skips the first 30, shows 10, then skips another 30, and so on until the end of the page.
Here is the code:
import selenium
from selenium import webdriver

path = "./chromedriver"
driver = webdriver.Chrome(executable_path=path)
url = 'https://www.deezer.com/fr/playlist/2560242784'
driver.get(url)

for i in range(0, 20):
    try:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
        musics = driver.find_elements_by_class_name('BT3T6')
        for music in musics:
            print(music.text)
    except Exception as e:
        print(e)
I tried scraping the page based on your code and got it working.
I decided to scroll the page by 500px per step and then remove all duplicates and empty strings. (The playlist appears to be a virtualized list that only keeps the rows near the viewport in the DOM, which is why jumping straight to the bottom skips tracks.)
import selenium
import time
from selenium import webdriver

path = "./chromedriver"
driver = webdriver.Chrome(executable_path=path)
url = 'https://www.deezer.com/fr/playlist/2560242784'
driver.get(url)

all_music = []
last_scroll_y = driver.execute_script("return window.scrollY")
for i in range(0, 100):
    try:
        # First scrape the rows that are currently rendered
        musics = driver.find_elements_by_class_name('BT3T6')
        for music in musics:
            all_music.append(music.text)
        # Then scroll down by 500px
        driver.execute_script("window.scrollTo(0, window.scrollY+500);")
        time.sleep(0.2)  # short wait for the new content (200 ms)
        current_scroll_y = driver.execute_script("return window.scrollY")
        # Exit the loop if the page cannot be scrolled any further
        if current_scroll_y == last_scroll_y:
            break
        last_scroll_y = current_scroll_y
    except Exception as e:
        print(e)

# This removes all empty strings
all_music = list(filter(None, all_music))

# This removes all duplicates but keeps the order
# (based on https://stackoverflow.com/a/17016257/5226491; Python 3.7+ required)
all_music = list(dict.fromkeys(all_music))

# This also removes all duplicates, but the order is not preserved
# all_music = list(set(all_music))

for m in all_music:
    print(m)
print('Total music found: ' + str(len(all_music)))
This runs for roughly 60-90 seconds and scrapes 1000+ items.
Note: it works fine with the window active, and also in headless mode, but it stops scraping when I minimize the browser window. So either run it with the headless Chrome option:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

CHROMEDRIVER_PATH = "./chromedriver"
options = Options()
options.headless = True
driver = webdriver.Chrome(CHROMEDRIVER_PATH, options=options)
or do not minimize the window.
I have implemented a full-page screenshot with Selenium WebDriver.
The code is as follows:
import time
import os
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
jenkinsJobName = os.getenv("JOB_NAME")
url = "https://www.ccode.com/sg"
save_fn = "testResult.PNG"
option = webdriver.ChromeOptions()
option.add_argument('--headless')
option.add_argument('--disable-gpu')
option.add_argument("--window-size=1280,1024")
option.add_argument("--hide-scrollbars")
driver = webdriver.Chrome(chrome_options=option)
driver.get(url)
print(driver.title)
scroll_width = driver.execute_script('return document.body.parentNode.scrollWidth')
scroll_height = driver.execute_script('return document.body.parentNode.scrollHeight')
driver.set_window_size(scroll_width, scroll_height)
driver.save_screenshot(save_fn)
driver.quit()
That works fine.
But then I use the code below to capture a full-page screenshot in mobile emulation:
import time
import os
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
jenkinsJobName = os.getenv("JOB_NAME")
url = "https://www.ccode.com/sg"
save_fn = "testResyyult.PNG"
option = webdriver.ChromeOptions()
option.add_argument('--headless')
option.add_argument('--disable-gpu')
mobile_emulation = {"deviceName": "iPhone 6"}
option.add_experimental_option("mobileEmulation", mobile_emulation)
option.add_argument("--auto-open-devtools-for-tabs")
driver = webdriver.Chrome(chrome_options=option)
driver.get(url)
print(driver.title)
scroll_width = driver.execute_script('return document.body.parentNode.scrollWidth')
scroll_height = driver.execute_script('return document.body.parentNode.scrollHeight')
driver.set_window_size(scroll_width, scroll_height)
driver.save_screenshot(save_fn)
driver.quit()
The resulting image captures only half of the page, not the full page as with the first snippet.
How can I fix this code?
Thanks
You need to use Firefox for full-page screenshots:
https://selenium-python.readthedocs.io/api.html#module-selenium.webdriver.firefox.webdriver
Here is your code, modified:
I have not run this in the context of your use case, but I have used it as part of my own project to take full-page screenshots. You will need to check whether the other parameters you described are compatible with Firefox.
import time
import os
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

jenkinsJobName = os.getenv("JOB_NAME")
url = "https://www.ccode.com/sg"
save_fn = "testResult.PNG"
option = webdriver.FirefoxOptions()  # ------ Changes here
option.add_argument('--headless')
# The next three options are Chrome-specific and have no Firefox equivalent,
# so they are commented out (FirefoxOptions has no add_experimental_option):
# option.add_argument('--disable-gpu')
# option.add_experimental_option("mobileEmulation", {"deviceName": "iPhone 6"})
# option.add_argument("--auto-open-devtools-for-tabs")
driver = webdriver.Firefox(options=option)  # ------ Changes here
driver.get(url)
print(driver.title)
scroll_width = driver.execute_script('return document.body.parentNode.scrollWidth')
scroll_height = driver.execute_script('return document.body.parentNode.scrollHeight')
driver.set_window_size(scroll_width, scroll_height)
driver.save_full_page_screenshot(save_fn)  # ------ Changes here; this Firefox-only method is the main change, see the documentation
driver.quit()
Also, you might want to check the URL you are taking a screenshot of; it seems to be unresponsive.
Edit: I also noticed that you want a mobile view. Unfortunately, I think Firefox does not return a valid mobile view even after setting:
option.enable_mobile
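Alternatively, if you want to stay on Chrome and keep the mobileEmulation option, Selenium 4's execute_cdp_cmd can ask DevTools to capture beyond the viewport. A sketch, assuming Chrome 90+ (where captureBeyondViewport is available):
import base64
from selenium import webdriver

option = webdriver.ChromeOptions()
option.add_argument('--headless')
option.add_experimental_option("mobileEmulation", {"deviceName": "iPhone 6"})
driver = webdriver.Chrome(options=option)
driver.get("https://www.ccode.com/sg")

# Render the whole page in one DevTools screenshot instead of resizing the window.
shot = driver.execute_cdp_cmd(
    "Page.captureScreenshot",
    {"captureBeyondViewport": True, "fromSurface": True},
)
with open("full_page_mobile.png", "wb") as f:
    f.write(base64.b64decode(shot["data"]))
driver.quit()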
Hey guys, I am trying to scrape some data from AliExpress, but whenever I try to access any URL it asks me to log in before showing the page. I don't know how to log in to the website automatically; some of you may use cookies, but I don't know how to use them. Here is my code:
import requests
from bs4 import BeautifulSoup
import csv
from selenium import webdriver

g = csv.writer(open('aliexpressnew.csv', 'a', newline='', encoding="utf-8"))
# g.writerow(['Product Name','Price','Category','Subcategory'])
links = [
    "https://www.aliexpress.com/category/205838503/iphones.html?spm=2114.search0103.0.0.6ab01fbbfe33Rm&site=glo&g=n&needQuery=n&tag="
]
for i in links:
    getlink = i
    while getlink != 0:
        chromepath = 'C:\\Users\\Faisal\\Desktop\\python\\chromedriver.exe'
        driver = webdriver.Chrome(chromepath)
        driver.get(getlink)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        # ... (elided: the loop over result items that sets itemsname1, title, price and subcat2) ...
        if itemsname1.find(class_='img-container left-block util-clearfix').find(class_='img').find(class_='picRind j-p4plog'):
            if itemsname1.find(class_='img-container left-block util-clearfix').find(class_='img').find(class_='picRind j-p4plog').find('img').get('src'):
                image = itemsname1.find(class_='img-container left-block util-clearfix').find(class_='img').find(class_='picRind j-p4plog').find('img').get('src')
            else:
                image = itemsname1.find(class_='img-container left-block util-clearfix').find(class_='img').find(class_='picRind j-p4plog').find('img').get('image-src')
        else:
            if itemsname1.find(class_='img-container left-block util-clearfix').find(class_='img').find(class_='picRind ').find('img').get('src'):
                image = itemsname1.find(class_='img-container left-block util-clearfix').find(class_='img').find(class_='picRind ').find('img').get('src')
            else:
                image = itemsname1.find(class_='img-container left-block util-clearfix').find(class_='img').find(class_='picRind ').find('img').get('image-src')
        image3 = 'http:' + str(image)
        print(title)
        print(price)
        # print(rating2)
        print(image3)
        g.writerow([title, price, subcat2, image])
        next1 = soup.find(class_='ui-pagination-navi util-left')
        if next1.find(class_="page-end ui-pagination-next ui-pagination-disabled"):
            getlink = 0
        else:
            next22 = next1.find(class_='page-next ui-pagination-next')
            next3 = "http:" + next22.get('href')
            getlink = next3
        driver.close()
It sounds like you're getting the browser's username/password prompt that appears before any page content loads. If that's the case, you can navigate to the following URI:
http://<username>:<password>@your-url-here.com
for example:
http://foo:bar@example.com
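In Selenium that looks like the sketch below. Note this is an assumption to verify: recent Chrome builds restrict URLs with embedded credentials, so it mainly helps on older browsers or plain HTTP Basic Auth prompts:
from selenium import webdriver

driver = webdriver.Chrome()
# The embedded credentials answer the HTTP Basic Auth prompt before the page loads.
driver.get("http://foo:bar@example.com")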
You can load a Chrome profile with the stored credentials automatically, to avoid logging in manually:
How to open URL through default Chrome profile using Python Selenium Webdriver
You have to add Chrome options to the webdriver:
options = webdriver.ChromeOptions()
# paths chrome in windows
options.add_argument("user-data-dir=C:/Users/NameUser/AppData/Local/Google/Chrome/User Data")
options.add_argument("profile-directory=Default")
driver = webdriver.Chrome(chromepath, chrome_options=options)
Make sure you have logged in to the website and stored the credentials when running Chrome normally.
First, you need to authenticate after opening the website through the Selenium driver.
You don't actually need cookies to do that.
Inspect the page to find the IDs of the login inputs so the driver can reach them, then use send_keys to fill them in:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
delay = 10  # seconds before timeout
chromepath = 'C:\\Users\\Faisal\\Desktop\\python\\chromedriver.exe'
driver = webdriver.Chrome(chromepath)
driver.get(ALI_EXPRESS_LINK)
# In order to wait for the full loading of the page
# (actually waits for the login input; you can find the id by inspecting the element)
WebDriverWait(driver, delay).until(EC.presence_of_element_located((By.ID, "fm-login-id")))
element = driver.find_element_by_id("fm-login-id")
element.send_keys(YOUR_LOGIN_ID)
# Doing the same for the password
element = driver.find_element_by_id("fm-login-password")
element.send_keys(YOUR_PASSWORD)
# Then click the submit button
driver.find_element_by_class_name("password-login").click()
Don't forget to define :
ALI_EXPRESS_LINK
YOUR_LOGIN_ID
YOUR_PASSWORD
:)
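And if you do want the cookie route the question mentions: a minimal sketch is to log in once, dump the session cookies to disk, and re-load them on later runs (the file name cookies.pkl is just an example):
import pickle
from selenium import webdriver

# First run: log in manually, then persist the session cookies.
driver = webdriver.Chrome()
driver.get("https://www.aliexpress.com")
input("Log in in the browser window, then press Enter here...")
pickle.dump(driver.get_cookies(), open("cookies.pkl", "wb"))
driver.quit()

# Later runs: load the cookies to skip the login form.
driver = webdriver.Chrome()
driver.get("https://www.aliexpress.com")  # must be on the domain before add_cookie
for cookie in pickle.load(open("cookies.pkl", "rb")):
    driver.add_cookie(cookie)
driver.get("https://www.aliexpress.com")  # reload; the session should now be authenticated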
My code :
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

# Open the URL
url = "https://www.youtube.com/user/xuanvinh1612/community"
driver_path = 'F:/chromedriver.exe'
browser = webdriver.Chrome(executable_path=driver_path)
browser.get(url)

# Auto scroll and auto click on every 'Read more' link
read_mores2 = browser.find_elements_by_link_text('Read more')
for read_more in read_mores2:
    browser.execute_script("arguments[0].scrollIntoView();", read_more)
    browser.execute_script("$(arguments[0]).click();", read_more)

# Scroll down; stop when all posts have been shown
read_mores2 = browser.find_elements_by_link_text('Read more')
The same code works on a couple of other websites. But when I reuse it to auto-scroll and auto-click on the YouTube community page, it does not work, and I don't know why. I need help, please.
Try this code:
It will first load all the pages, then click on all Read More.
import time
from selenium import webdriver

# Open the URL
url = "https://www.youtube.com/user/xuanvinh1612/community"
browser = webdriver.Chrome()
browser.get(url)

# Auto scroll and auto click on every 'Read more' link
previous_count = 0
page_sections = browser.find_elements_by_css_selector('.style-scope.ytd-item-section-renderer')
current_count = len(page_sections)
print("Scrolling to enable all the pages")
while previous_count != current_count:
    try:
        previous_count = current_count
        browser.execute_script("arguments[0].scrollIntoView();", page_sections[-1])
        print("Number of total Elements found: {}".format(len(page_sections)))
    finally:
        # As the page loads the newer elements, you need to implement logic here to wait
        # until the loading spinner at the bottom becomes invisible (not attached to the DOM)
        time.sleep(2)  # workaround until the above logic is implemented
        page_sections = browser.find_elements_by_css_selector('.style-scope.ytd-item-section-renderer')
        current_count = len(page_sections)

print("Clicking on all Read More")
for read_more in browser.find_elements_by_css_selector('.more-button'):
    browser.execute_script("arguments[0].scrollIntoView();", read_more)
    browser.execute_script("arguments[0].click();", read_more)