Retrieve all car links from a dynamic page - Python

from selenium import webdriver
from bs4 import BeautifulSoup
import time

options = webdriver.ChromeOptions()
options.add_argument("--user-agent='Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36'")
#options.add_argument("headless")
driver = webdriver.Chrome(executable_path="/home/timmy/Python/chromedriver", chrome_options=options)
url = "https://turo.com/search?country=US&defaultZoomLevel=7&endDate=03%2F20%2F2019&endTime=10%3A00&international=true&isMapSearch=false&itemsPerPage=200&location=Colorado%2C%20USA&locationType=City&maximumDistanceInMiles=30&northEastLatitude=41.0034439&northEastLongitude=-102.040878&region=CO&sortType=RELEVANCE&southWestLatitude=36.992424&southWestLongitude=-109.060256&startDate=03%2F15%2F2019&startTime=10%3A00"
driver.get(url)

list_of_all_car_links = []
x = 0
while True:
    html = driver.page_source
    soup = BeautifulSoup(html, "html.parser")
    for i in soup.find_all("a", href=True):
        if i['href'].startswith("/rentals") and len(i['href']) > 31:
            link2 = "https://turo.com" + i['href']
            list_of_all_car_links.append(link2)
    try:
        x = scrolldown(last_height=x)
    except KeyError:
        #driver.close()
        break
I tried scrolling down and then finding the links, but I only got part of them. Here is my scroll-down function:
def scrolldown(last_height=0, SCROLL_PAUSE_TIME=3, num_tries=2):
    # Scroll down to bottom
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
    # Wait to load page
    time.sleep(SCROLL_PAUSE_TIME)
    new_height = driver.execute_script("return document.body.scrollHeight")
    # break condition
    if last_height == new_height:
        #print("hello")
        num_tries -= 1
        if num_tries == 0:
            print("Reached End of page")
            raise KeyError
        else:
            scrolldown(last_height=last_height, SCROLL_PAUSE_TIME=2, num_tries=num_tries)
    return new_height
I also tried converting the HTML to BeautifulSoup after each scroll and then finding the links, but I still didn't get all of them. What I want is to get every car link on that page.

I would use requests and the API shown in the XHR list in dev tools. Note the items-per-page parameter in the query string, itemsPerPage=200. You can try altering this for larger result sets.
import requests

url = 'https://turo.com/api/search?country=US&defaultZoomLevel=7&endDate=03%2F20%2F2019&endTime=10%3A00&international=true&isMapSearch=false&itemsPerPage=200&location=Colorado%2C%20USA&locationType=City&maximumDistanceInMiles=30&northEastLatitude=41.0034439&northEastLongitude=-102.040878&region=CO&sortType=RELEVANCE&southWestLatitude=36.992424&southWestLongitude=-109.060256&startDate=03%2F15%2F2019&startTime=10%3A00'
baseUrl = 'https://turo.com'
headers = {'Referer': 'https://turo.com/search?country=US&defaultZoomLevel=7&endDate=03%2F20%2F2019&endTime=10%3A00&international=true&isMapSearch=false&itemsPerPage=200&location=Colorado%2C%20USA&locationType=City&maximumDistanceInMiles=30&northEastLatitude=41.0034439&northEastLongitude=-102.040878&region=CO&sortType=RELEVANCE&southWestLatitude=36.992424&southWestLongitude=-109.060256&startDate=03%2F15%2F2019&startTime=10%3A00',
           'User-Agent': 'Mozilla/5.0'}

r = requests.get(url, headers=headers).json()
results = []
for item in r['list']:
    results.append(baseUrl + item['vehicle']['url'])
print(results)
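If you want to experiment with a different itemsPerPage, here is a minimal sketch that expresses the query string as a params dict so the value is easy to change. Only a subset of the original parameters is shown, and whether the API honors larger values is an assumption to verify in dev tools.
import requests

api_url = "https://turo.com/api/search"
params = {
    "country": "US",
    "region": "CO",
    "location": "Colorado, USA",
    "locationType": "City",
    "startDate": "03/15/2019",
    "startTime": "10:00",
    "endDate": "03/20/2019",
    "endTime": "10:00",
    "sortType": "RELEVANCE",
    "itemsPerPage": 200,  # assumption: the API may accept other values; verify in dev tools
}
headers = {"Referer": "https://turo.com/search", "User-Agent": "Mozilla/5.0"}

r = requests.get(api_url, params=params, headers=headers).json()
links = ["https://turo.com" + item["vehicle"]["url"] for item in r["list"]]
print(len(links))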

Related

Scraping Google App All Reviews using Selenium and Python

I want to scrape all reviews of a particular app from the Google Play store. I have prepared the following script:
# App Reviews Scraper
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup

url = "https://play.google.com/store/apps/details?id=com.android.chrome&hl=en&showAllReviews=true"

# make request
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.get(url)

SCROLL_PAUSE_TIME = 5

# Get scroll height
last_height = driver.execute_script("return document.body.scrollHeight")
time.sleep(SCROLL_PAUSE_TIME)

while True:
    # Scroll down to bottom
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    # Wait to load page
    time.sleep(SCROLL_PAUSE_TIME)
    # Calculate new scroll height and compare with last scroll height
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height

# Get everything inside <html> tag including javascript
html = driver.execute_script("return document.getElementsByTagName('html')[0].innerHTML")
soup = BeautifulSoup(html, 'html.parser')

reviewer = []
date = []

# review text
for span in soup.find_all("span", class_="X43Kjb"):
    reviewer.append(span.text)

# review date
for span in soup.find_all("span", class_="p2TkOb"):
    date.append(span.text)

print(len(reviewer))
print(len(date))
However, it always shows only 203. There are 35,474,218 reviews in total. So how can I download all of them?
wait = WebDriverWait(driver, 1)
try:
    wait.until(EC.element_to_be_clickable((By.XPATH, "//span[text()='Show More']"))).click()
except:
    continue
Just add this inside your infinite-scroll loop to check for the Show More element.
Imports:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
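For context, a minimal sketch of how that check might slot into the scroll loop from the question. The driver and the Show More XPath come from the snippets above; the pause length is illustrative.
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

SCROLL_PAUSE_TIME = 5
last_height = driver.execute_script("return document.body.scrollHeight")

while True:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(SCROLL_PAUSE_TIME)
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        # The page may have paused on a "Show More" button rather than truly ended.
        try:
            wait = WebDriverWait(driver, 1)
            wait.until(EC.element_to_be_clickable((By.XPATH, "//span[text()='Show More']"))).click()
            continue  # keep scrolling once the extra reviews have loaded
        except Exception:
            break  # no button found: assume we reached the end
    last_height = new_height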
An easier method to scrape app data from the Play store is the google_play_scraper package:
!pip install google_play_scraper

from google_play_scraper import app

# US market Google Play store reviews
from google_play_scraper import Sort, reviews_all

us_reviews = reviews_all(
    'add the app id here',  # use the string after the id value in the Play store hyperlink that you used above
    sleep_milliseconds=0,   # defaults to 0
    lang='en',              # defaults to 'en', can change to another lang as well
    country='us',           # defaults to 'us'
    sort=Sort.NEWEST,       # defaults to Sort.MOST_RELEVANT
)
Convert the result into a data frame:
import pandas as pd
import numpy as np

df = pd.DataFrame(np.array(us_reviews), columns=['review'])
df = df.join(pd.DataFrame(df.pop('review').tolist()))
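A quick sanity check on the result, assuming the us_reviews list and the df from the snippets above:
# Each element of us_reviews is a dict describing one review; after the join,
# every dict key becomes its own column in df.
print(len(us_reviews))        # number of reviews fetched
print(df.shape)               # rows x columns after expanding the dicts
print(df.columns.tolist())    # see which fields are available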
I think there's no way to extract all reviews because of Google's limit. For example, the com.collectorz.javamobile.android.books app has 2470 reviews, but only 879 are actually shown after scrolling to the very end of the reviews, a 64.41% decrease.
Calculation example:
(879 - 2470)/2470 = -64.41% (64.41% decrease)
In the Chrome dev tools after scrolling to the very end of reviews:
$$(".X5PpBb")
[0 … 99]
[100 … 199]
[200 … 299]
[300 … 399]
[400 … 499]
[500 … 599]
[600 … 699]
[700 … 799]
[800 … 878]
length: 879 👈👈👈
In the new UI a Show More button appears, so execution can stall or throw an error and return fewer reviews.
To extract all available data, you need to check if the See all reviews button is present. The button may be absent if the app has few or no reviews. If the button is present, then you need to click on it and wait until the data is loaded:
# if "See all reviews" button present
if driver.find_element(By.CSS_SELECTOR, ".Jwxk6d .u4ICaf button"):
# clicking on the button
button = driver.find_element(By.CSS_SELECTOR, ".Jwxk6d .u4ICaf button")
driver.execute_script("arguments[0].click();", button)
# waiting a few sec to load comments
time.sleep(4)
When the data has loaded, you need to scroll the page. You can make a small change to your page-scrolling algorithm: if the variables new_height and old_height are equal, the program looks for the Show More button selector. If the button exists, the program clicks it and proceeds to the next step:
if new_height == old_height:
    try:
        show_more = driver.find_element(By.XPATH, "//span[text()='Show More']")
        driver.execute_script("arguments[0].click();", show_more)
        time.sleep(1)
    except:
        break
Code and full example in online IDE:
import time, lxml, re, json
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup

URL = "https://play.google.com/store/apps/details?id=com.collectorz.javamobile.android.books&hl=en"

service = Service(ChromeDriverManager().install())

options = webdriver.ChromeOptions()
options.add_argument("--headless")
options.add_argument("--lang=en")
options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")

driver = webdriver.Chrome(service=service, options=options)
driver.get(URL)

# if "See all reviews" button present
if driver.find_element(By.CSS_SELECTOR, ".Jwxk6d .u4ICaf button"):
    # clicking on the button
    button = driver.find_element(By.CSS_SELECTOR, ".Jwxk6d .u4ICaf button")
    driver.execute_script("arguments[0].click();", button)
    # waiting a few sec to load comments
    time.sleep(4)

old_height = driver.execute_script("""
    function getHeight() {
        return document.querySelector('.fysCi').scrollHeight;
    }
    return getHeight();
""")

# scrolling
while True:
    driver.execute_script("document.querySelector('.fysCi').scrollTo(0, document.querySelector('.fysCi').scrollHeight)")
    time.sleep(1)

    new_height = driver.execute_script("""
        function getHeight() {
            return document.querySelector('.fysCi').scrollHeight;
        }
        return getHeight();
    """)

    if new_height == old_height:
        try:
            # if "Show More" button present
            show_more = driver.find_element(By.XPATH, "//span[text()='Show More']")
            driver.execute_script("arguments[0].click();", show_more)
            time.sleep(1)
        except:
            break

    old_height = new_height

# done scrolling
soup = BeautifulSoup(driver.page_source, 'lxml')
driver.quit()

user_comments = []

# extracting comments
for index, comment in enumerate(soup.select(".RHo1pe"), start=1):
    comment_likes = comment.select_one(".AJTPZc")

    user_comments.append({
        "position": index,
        "user_name": comment.select_one(".X5PpBb").text,
        "user_avatar": comment.select_one(".gSGphe img").get("srcset").replace(" 2x", ""),
        "user_comment": comment.select_one(".h3YV2d").text,
        "comment_likes": comment_likes.text.split("people")[0].strip() if comment_likes else None,
        "app_rating": re.search(r"\d+", comment.select_one(".iXRFPc").get("aria-label")).group(),
        "comment_date": comment.select_one(".bp9Aid").text
    })

print(json.dumps(user_comments, indent=2, ensure_ascii=False))
If you want to extract reviews much faster, you can use Google Play Product Reviews API from SerpApi. It will bypass blocks from search engines and you don't have to create the parser from scratch and maintain it.
Code example that paginates through all pages and extracts reviews:
from serpapi import GoogleSearch
from urllib.parse import urlsplit, parse_qsl
import os, json

params = {
    # https://docs.python.org/3/library/os.html#os.getenv
    'api_key': os.getenv('API_KEY'),    # your serpapi api key
    "engine": "google_play_product",    # serpapi parsing engine
    "store": "apps",                    # app results
    "gl": "us",                         # country of the search
    "hl": "en",                         # language of the search
    "product_id": "com.collectorz.javamobile.android.books"  # app id
}

search = GoogleSearch(params)  # where data extraction happens on the backend

reviews = []

while True:
    results = search.get_dict()  # JSON -> Python dict

    for review in results["reviews"]:
        reviews.append({
            "title": review.get("title"),
            "avatar": review.get("avatar"),
            "rating": review.get("rating"),
            "likes": review.get("likes"),
            "date": review.get("date"),
            "snippet": review.get("snippet"),
            "response": review.get("response")
        })

    # pagination
    if "next" in results.get("serpapi_pagination", {}):
        search.params_dict.update(dict(parse_qsl(urlsplit(results.get("serpapi_pagination", {}).get("next")).query)))
    else:
        break

print(json.dumps(reviews, indent=2, ensure_ascii=False))
There's a Scrape All Google Play App Reviews in Python blog post, which shows in detail how to extract all reviews.
Disclaimer, I work for SerpApi.

Selenium doesn't return all elements required

I'm trying to get the links of the houses from this website, but it returns only about 9 elements even though the page has more. I also tried using Beautiful Soup, but the same thing happens and it doesn't return all the elements.
With Selenium:
for i in range(10):
    time.sleep(1)
    scr1 = driver.find_element_by_xpath('//*[@id="search-page-list-container"]')
    driver.execute_script("arguments[0].scrollTop = arguments[0].scrollHeight", scr1)

link_tags = driver.find_elements_by_css_selector(".list-card-info a")
links = [link.get_attribute("href") for link in link_tags]
pprint(links)
With bs4:
headers = {
    'accept-language': 'en-GB,en;q=0.8,en-US;q=0.6,ml;q=0.4',
    'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/74.0.3729.131 Safari/537.36'
}

response = requests.get(ZILLOW_URL, headers=headers)
website_content = response.text
soup = BeautifulSoup(website_content, "html.parser")
link_tags = soup.select(".list-card-info a")
link_list = [link.get("href") for link in link_tags]
pprint(link_list)
Output:
'https://www.zillow.com/b/407-fairmount-ave-oakland-ca-9NTzMK/',
'https://www.zillow.com/homedetails/1940-Buchanan-St-A-San-Francisco-CA-94115/15075413_zpid/',
'https://www.zillow.com/homedetails/2380-California-St-QZ6SFATJK-San-Francisco-CA-94115/2078197750_zpid/',
'https://www.zillow.com/homedetails/5687-Miles-Ave-Oakland-CA-94618/299065263_zpid/',
'https://www.zillow.com/b/olume-san-francisco-ca-65f3Yr/',
'https://www.zillow.com/homedetails/29-Balboa-St-APT-1-San-Francisco-CA-94118/2092859824_zpid/']
Is there any way to tackle this problem? I would really appreciate the help.
You have to scroll to each element one by one in a loop and then look for the descendant anchor tag that has the href.
driver.maximize_window()
#driver.implicitly_wait(30)
wait = WebDriverWait(driver, 50)
driver.get("https://www.zillow.com/homes/for_rent/1-_beds/?searchQueryState=%7B%22pagination%22%3A%7B%7D%2C%22mapBounds%22%3A%7B%22west%22%3A-123.7956336875%2C%22east%22%3A-121.6368202109375%2C%22south%22%3A37.02044483468766%2C%22north%22%3A38.36482775108166%7D%2C%22isMapVisible%22%3Atrue%2C%22filterState%22%3A%7B%22price%22%3A%7B%22max%22%3A872627%7D%2C%22beds%22%3A%7B%22min%22%3A1%7D%2C%22fore%22%3A%7B%22value%22%3Afalse%7D%2C%22mp%22%3A%7B%22max%22%3A3000%7D%2C%22auc%22%3A%7B%22value%22%3Afalse%7D%2C%22nc%22%3A%7B%22value%22%3Afalse%7D%2C%22fr%22%3A%7B%22value%22%3Atrue%7D%2C%22fsbo%22%3A%7B%22value%22%3Afalse%7D%2C%22cmsn%22%3A%7B%22value%22%3Afalse%7D%2C%22fsba%22%3A%7B%22value%22%3Afalse%7D%7D%2C%22isListVisible%22%3Atrue%2C%22mapZoom%22%3A9%7D")

j = 1
for i in range(len(driver.find_elements(By.XPATH, "//article"))):
    all_items = driver.find_element_by_xpath(f"(//article)[{j}]")
    driver.execute_script("arguments[0].scrollIntoView(true);", all_items)
    print(all_items.find_element_by_xpath('.//descendant::a').get_attribute('href'))
    j = j + 1
    time.sleep(2)
Imports:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
Output:
https://www.zillow.com/b/bay-village-vallejo-ca-5XkKWj/
https://www.zillow.com/b/waterbend-apartments-san-francisco-ca-9NLgqG/
https://www.zillow.com/b/the-verdant-apartments-san-jose-ca-5XsGhW/
https://www.zillow.com/homedetails/1539-Lincoln-Ave-San-Rafael-CA-94901/80743209_zpid/
https://www.zillow.com/b/the-crossing-at-arroyo-trail-livermore-ca-5XjR44/
https://www.zillow.com/homedetails/713-Trancas-St-APT-4-Napa-CA-94558/2081608744_zpid/
https://www.zillow.com/b/americana-apartments-mountain-view-ca-5hGhMy/
https://www.zillow.com/b/jackson-arms-apartments-hayward-ca-5XxZLv/
https://www.zillow.com/b/elan-at-river-oaks-san-jose-ca-5XjLQF/
https://www.zillow.com/homedetails/San-Francisco-CA-94108/2078592726_zpid/
https://www.zillow.com/homedetails/20914-Cato-Ct-Castro-Valley-CA-94546/2068418792_zpid/
https://www.zillow.com/homedetails/1240-21st-Ave-3-San-Francisco-CA-94122/2068418798_zpid/
https://www.zillow.com/homedetails/1246-Walker-Ave-APT-207-Walnut-Creek-CA-94596/18413629_zpid/
https://www.zillow.com/b/the-presidio-fremont-ca-5Xk3QQ/
https://www.zillow.com/homedetails/1358-Noe-St-1-San-Francisco-CA-94131/2068418857_zpid/
https://www.zillow.com/b/the-estates-at-park-place-fremont-ca-5XjVpg/
https://www.zillow.com/homedetails/2060-Camel-Ln-Walnut-Creek-CA-94596/2093645611_zpid/
https://www.zillow.com/b/840-van-ness-san-francisco-ca-5YCwMj/
https://www.zillow.com/homedetails/285-Grand-View-Ave-APT-6-San-Francisco-CA-94114/2095256302_zpid/
https://www.zillow.com/homedetails/929-Oak-St-APT-3-San-Francisco-CA-94117/2104800238_zpid/
https://www.zillow.com/homedetails/420-N-Civic-Dr-APT-303-Walnut-Creek-CA-94596/18410162_zpid/
https://www.zillow.com/homedetails/1571-Begen-Ave-Mountain-View-CA-94040/19533010_zpid/
https://www.zillow.com/homedetails/145-Woodbury-Cir-D-Vacaville-CA-95687/2068419093_zpid/
https://www.zillow.com/b/trinity-towers-apartments-san-francisco-ca-5XjPdR/
https://www.zillow.com/b/hidden-creek-vacaville-ca-5XjV3h/
https://www.zillow.com/homedetails/19-Belle-Ave-APT-7-San-Anselmo-CA-94960/2081212106_zpid/
https://www.zillow.com/homedetails/1560-Jackson-St-APT-11-Oakland-CA-94612/2068419279_zpid/
https://www.zillow.com/homedetails/1465-Marchbanks-Dr-APT-2-Walnut-Creek-CA-94598/18382713_zpid/
https://www.zillow.com/homedetails/205-Morning-Sun-Ave-B-Mill-Valley-CA-94941/2077904048_zpid/
https://www.zillow.com/homedetails/1615-Pacific-Ave-B-Alameda-CA-94501/2073535331_zpid/
https://www.zillow.com/homedetails/409-S-5th-St-1-San-Jose-CA-95112/2078856409_zpid/
https://www.zillow.com/homedetails/5635-Anza-St-P5G3CZYNW-San-Francisco-CA-94121/2068419581_zpid/
https://www.zillow.com/b/407-fairmount-ave-oakland-ca-9NTzMK/
https://www.zillow.com/homedetails/1940-Buchanan-St-A-San-Francisco-CA-94115/15075413_zpid/
https://www.zillow.com/homedetails/2380-California-St-QZ6SFATJK-San-Francisco-CA-94115/2078197750_zpid/
https://www.zillow.com/homedetails/1883-Agnew-Rd-UNIT-241-Santa-Clara-CA-95054/79841436_zpid/
https://www.zillow.com/b/marina-playa-santa-clara-ca-5XjKBc/
https://www.zillow.com/b/birch-creek-mountain-view-ca-5XjKKB/
https://www.zillow.com/homedetails/969-Clark-Ave-D-Mountain-View-CA-94040/2068419946_zpid/
https://www.zillow.com/homedetails/74-Williams-St-San-Leandro-CA-94577/24879175_zpid/
The problem is the website: it adds the links dynamically, so you can try scrolling to the bottom of the page and then searching for the links.
bottomFooter = driver.find_element_by_id("region-info-footer")
driver.execute_script("arguments[0].scrollIntoView();", bottomFooter)

How to get href with BeautifulSoup

The Situation
I want to scrape from this website:
http://www.dpm.tn/dpm_pharm/medicament/listmedicparnomspec.php
My code:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import requests
from bs4 import BeautifulSoup

# agent
user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36"

# headless driver
options = webdriver.ChromeOptions()
options.headless = True
options.add_argument(f'user-agent={user_agent}')
options.add_argument("--window-size=1920,1080")
options.add_argument('--ignore-certificate-errors')
options.add_argument('--allow-running-insecure-content')
options.add_argument("--disable-extensions")
options.add_argument("--proxy-server='direct://'")
options.add_argument("--proxy-bypass-list=*")
options.add_argument("--start-maximized")
options.add_argument('--disable-gpu')
options.add_argument('--disable-dev-shm-usage')
options.add_argument('--no-sandbox')
driver = webdriver.Chrome(executable_path=r"D:\Downloads\chromedriver.exe", options=options)

# request test
medecine = 'doliprane'

# submitting a search
driver.get('http://www.dpm.tn/dpm_pharm/medicament/listmedicparnomspec.php')
e = driver.find_element_by_name('id')
e.send_keys(medecine)
e.submit()

# getting the result table
try:
    table = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody')
    print('success')
except:
    print('failed')
The code to get the links:
print('bs4 turn \n')
result = BeautifulSoup(table.get_attribute('innerHTML'), 'lxml')
rows = result.find_all('tr')
links = []
real_link = []
for row in rows:
    links.append(row.find('a', href=True))
for each in links:
    print(each['href'])
The Problem:
Whenever I run this, I always get this error:
'NoneType' object is not subscriptable
The question:
How can I fix this and find the href attribute as required?
Instead of using Selenium, use the requests library to fetch the data and parse it.
Code:
import re
import requests
from bs4 import BeautifulSoup
medecine = 'doliprane'
url = "http://www.dpm.tn/dpm_pharm/medicament/listmedicspec.php"
payload = {"id":medecine}
response = requests.post(url, data=payload)
parsedhtml=BeautifulSoup(response.content,"html.parser")
regex = re.compile('fiche.php.*')
atag=parsedhtml.findAll("a",{"href":regex})
links =[i['href'].replace("fiche.php","http://www.dpm.tn/dpm_pharm/medicament/fiche.php") for i in atag ]
print(links)
Let me know if you have any questions :)
When accessing it, try this:
print('bs4 turn \n')
result = BeautifulSoup(table.get_attribute('innerHTML'), 'lxml')
rows = result.find_all('tr')
links = []
real_link = []
for row in rows:
    a = row.find("a", href=True)
    if a:  # skip rows without a link to avoid the NoneType error
        links.append(a['href'])
for each in links:
    print(each)
I solved it, but by using Selenium instead of Beautiful Soup:
for i in range(2, max):
    a_driver = driver.find_element_by_xpath(f'/html/body/table/tbody/tr/td/table/tbody/tr[{i}]/td[11]/a')
    result2 = BeautifulSoup(a_driver.get_attribute('innerHTML'), 'lxml')
    link = a_driver.get_attribute('href')
    links.append(link)

for i in range(0, len(links)):
    print(links[i])
This worked for me.

Scraping all results from page with BeautifulSoup

Update
===================================================
OK guys, so far so good. I have code that allows me to scrape images, but it stores them in a strange way: it downloads the first 40+ images, then creates another 'kittens' folder inside the previously created 'kittens' folder and starts over (downloading the same images as in the first folder). How can I change that? Here is the code:
from selenium import webdriver
from selenium.webdriver import Chrome
from selenium.common.exceptions import WebDriverException
from bs4 import BeautifulSoup as soup
import requests
import time
import os

image_tags = []

driver = webdriver.Chrome()
driver.get(url='https://www.pexels.com/search/kittens/')
last_height = driver.execute_script('return document.body.scrollHeight')

while True:
    driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
    time.sleep(1)
    new_height = driver.execute_script('return document.body.scrollHeight')
    if new_height == last_height:
        break
    else:
        last_height = new_height

sp = soup(driver.page_source, 'html.parser')

for img_tag in sp.find_all('img'):
    image_tags.append(img_tag)

if not os.path.exists('kittens'):
    os.makedirs('kittens')
os.chdir('kittens')
x = 0

for image in image_tags:
    try:
        url = image['src']
        source = requests.get(url)
        with open('kitten-{}.jpg'.format(x), 'wb') as f:
            f.write(requests.get(url).content)
        x += 1
    except:
        pass
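It's hard to be certain from the snippet alone, but the nested folder is consistent with the makedirs/chdir block running more than once in the same process. A sketch of the download step that avoids os.chdir entirely, so repeated runs cannot nest folders (it assumes the image_tags list collected above):
import os
import requests

# Resolve the output directory once and write by full path instead of chdir'ing into it.
out_dir = os.path.abspath('kittens')
os.makedirs(out_dir, exist_ok=True)

for x, image in enumerate(image_tags):
    src = image.get('src')
    if not src:
        continue
    try:
        data = requests.get(src).content
        with open(os.path.join(out_dir, 'kitten-{}.jpg'.format(x)), 'wb') as f:
            f.write(data)
    except requests.RequestException:
        pass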
===========================================================================
I'm trying to write a spider to scrape images of kittens from a page. I've got a small problem: my spider only gets the first 15 images. I know it's probably because the page loads more images after scrolling down. How can I resolve this issue?
Here is the code:
import requests
from bs4 import BeautifulSoup as bs
import os

url = 'https://www.pexels.com/search/cute%20kittens/'
page = requests.get(url)
soup = bs(page.text, 'html.parser')
image_tags = soup.findAll('img')

if not os.path.exists('kittens'):
    os.makedirs('kittens')
os.chdir('kittens')
x = 0

for image in image_tags:
    try:
        url = image['src']
        source = requests.get(url)
        if source.status_code == 200:
            with open('kitten-' + str(x) + '.jpg', 'wb') as f:
                f.write(requests.get(url).content)
                f.close()
            x += 1
    except:
        pass
Since the site is dynamic, you need to use a browser manipulation tool such as selenium:
from selenium import webdriver
from bs4 import BeautifulSoup as soup
import time
import os

driver = webdriver.Chrome()
driver.get('https://www.pexels.com/search/cute%20kittens/')

last_height = driver.execute_script("return document.body.scrollHeight")
while True:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(0.5)
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height

image_urls = [i['src'] for i in soup(driver.page_source, 'html.parser').find_all('img')]

if not os.path.exists('kittens'):
    os.makedirs('kittens')
os.chdir('kittens')

with open('kittens.txt', 'w') as f:  # open for writing; the original snippet omitted the mode
    for url in image_urls:
        f.write('{}\n'.format(url))

Retrieving all information from page BeautifulSoup

I am attempting to scrape the URLs of the products on an OldNavy webpage. However, it is only giving part of the product list instead of the whole thing (for example, only 8 URLs when there are far more than 8). I was hoping someone could help identify what the problem may be.
from bs4 import BeautifulSoup
from selenium import webdriver
import html5lib
import platform
import urllib
import urllib2
import json

link = "http://oldnavy.gap.com/browse/category.do?cid=1035712&sop=true"
base_url = "http://www.oldnavy.com"

driver = webdriver.PhantomJS()
driver.get(link)
html = driver.page_source
soup = BeautifulSoup(html, "html5lib")

j = 0
bigDiv = soup.findAll("div", class_="sp_sm spacing_small")
for div in bigDiv:
    links = div.findAll("a")
    for i in links:
        j = j + 1
        productUrl = base_url + i["href"]
        print productUrl
This page uses JavaScript to load elements, but only when you scroll down the page. This is called "lazy loading".
You have to scroll the page too.
from selenium import webdriver
from bs4 import BeautifulSoup
import time

link = "http://oldnavy.gap.com/browse/category.do?cid=1035712&sop=true"
base_url = "http://www.oldnavy.com"

driver = webdriver.PhantomJS()
driver.get(link)

# ---
# scrolling
lastHeight = driver.execute_script("return document.body.scrollHeight")
#print(lastHeight)

pause = 0.5
while True:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(pause)
    newHeight = driver.execute_script("return document.body.scrollHeight")
    if newHeight == lastHeight:
        break
    lastHeight = newHeight
    #print(lastHeight)
# ---

html = driver.page_source
soup = BeautifulSoup(html, "html5lib")

#driver.find_element_by_class_name
divs = soup.find_all("div", class_="sp_sm spacing_small")
for div in divs:
    links = div.find_all("a")
    for link in links:
        print base_url + link["href"]
Idea: https://stackoverflow.com/a/28928684/1832058
