I am trying to scrape news from the Skynews website, which has a dynamic scrolling page (the link to the page I am trying to scrape: https://www.skynewsarabia.com/live-story/1345832-%D9%85%D8%B3%D8%AA%D8%AC%D8%AF%D8%A7%D8%AA-%D9%83%D9%88%D8%B1%D9%88%D9%86%D8%A7-%D8%A7%D9%94%D8%AD%D8%AF%D8%AB-%D8%A7%D9%94%D8%AE%D8%A8%D8%A7%D8%B1-%D8%A7%D9%84%D9%81%D9%8A%D8%B1%D9%88%D8%B3-%D8%A7%D9%84%D8%B9%D8%A7%D9%84%D9%85).
I am using Selenium to scroll down the page and BeautifulSoup to scrape information.
The problem is that my code only returns the first five news items and does not perform any scrolling. However, the same code works well on other websites, scrolling down the page as expected.
Following is the code I am using to scroll the site and scrape the info:
from selenium import webdriver
from bs4 import BeautifulSoup
import requests
import time

url = "https://www.skynewsarabia.com/live-story/1345832-%D9%85%D8%B3%D8%AA%D8%AC%D8%AF%D8%A7%D8%AA-%D9%83%D9%88%D8%B1%D9%88%D9%86%D8%A7-%D8%A7%D9%94%D8%AD%D8%AF%D8%AB-%D8%A7%D9%94%D8%AE%D8%A8%D8%A7%D8%B1-%D8%A7%D9%84%D9%81%D9%8A%D8%B1%D9%88%D8%B3-%D8%A7%D9%84%D8%B9%D8%A7%D9%84%D9%85"
fake_news = []

driver = webdriver.Chrome(executable_path=r"C:\Users\Acoe\Documents\yaman\Scraping\chromedriver.exe")
driver.get(url)
time.sleep(3)

previous_height = driver.execute_script('return document.body.scrollHeight')

while True:
    #### the below code scrolls down the web page
    driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
    time.sleep(3)
    new_height = driver.execute_script('return document.body.scrollHeight')
    # we use "+ str(i+1)" to loop over all the pages at this address because it's
    # the same page address with the page number at the end
    ## Start the scraping
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    headlines = soup.find('body').find_all('h2')
    for x in headlines:
        if len(x.text.strip()) > 20:
            fake_news.append(x.text.strip())
            print(x.text.strip())
        else:
            pass
    if new_height == previous_height:
        break
    previous_height = new_height
Does anyone have any clue why the code cannot scroll down this particular website, given that it works fine for other websites with the same properties (e.g. https://www.reddit.com/search/?q=covid19)?
The problem with this page is that it does not load new articles when you jump straight to the bottom of the scroll. You can solve this by incrementing the scroll position a little at a time instead of going directly to the bottom of the page:
scrolled = 500
while True:
    driver.execute_script(f'window.scrollTo(0, {scrolled});')
    scrolled += 500
    time.sleep(1)
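For completeness, here is a minimal sketch of how the incremental scroll could be wired into the rest of the original script. It parses driver.page_source instead of making a separate requests call (so the HTML reflects what Selenium actually loaded), keeps the h2 selector from the question, and uses an assumed stop condition and 500-pixel step that you may need to tune.
from selenium import webdriver
from bs4 import BeautifulSoup
import time

url = "https://www.skynewsarabia.com/live-story/1345832-%D9%85%D8%B3%D8%AA%D8%AC%D8%AF%D8%A7%D8%AA-%D9%83%D9%88%D8%B1%D9%88%D9%86%D8%A7-%D8%A7%D9%94%D8%AD%D8%AF%D8%AB-%D8%A7%D9%94%D8%AE%D8%A8%D8%A7%D8%B1-%D8%A7%D9%84%D9%81%D9%8A%D8%B1%D9%88%D8%B3-%D8%A7%D9%84%D8%B9%D8%A7%D9%84%D9%85"
driver = webdriver.Chrome()  # assumes chromedriver is on the PATH
driver.get(url)
time.sleep(3)

scrolled = 500
last_height = driver.execute_script('return document.body.scrollHeight')

while True:
    driver.execute_script(f'window.scrollTo(0, {scrolled});')
    scrolled += 500
    time.sleep(1)
    new_height = driver.execute_script('return document.body.scrollHeight')
    # Stop once we have scrolled past the bottom and the page stops growing
    if scrolled > new_height and new_height == last_height:
        break
    last_height = new_height

# Parse the page Selenium actually rendered, not a fresh requests download
soup = BeautifulSoup(driver.page_source, 'html.parser')
headlines = [h.text.strip() for h in soup.find_all('h2') if len(h.text.strip()) > 20]
print(len(headlines))
driver.quit()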
I understand that the key to scraping YouTube is to carry out the following automated actions using Selenium, WebDriver and Python:
Launch the Chrome browser (or Firefox)
Authenticate YouTube "I Agree" button
Supply a YouTube URL
Determine the scroll height of the page
Scrape video content
I have created the script below that successfully launches the Chrome browser, authenticates the "I Agree" button, launches the YouTube URL and scrolls down the page whilst carrying out the Title, Views and Posted Date scraping in the background.
Issue
In my test whilst scraping a YouTube VIDEOS Page with 108 videos, it only captures the first 30 videos.
Behind the scenes, it cycles through the script, identifying 4 different scroll heights: 2264, 3812, 5576 and 6380, respectively.
However, each scroll height duplicates the original capture (the first 30 videos).
Thus, I get 30 unique video captures (titles, views and posted dates) presented 4 times, for a total video count of 120. If all were working well, I should get 108 unique video captures.
I have read many articles on Stackoverflow, but cannot find any reference material to address this issue. Any help would be much appreciated.
Python Script
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
import pandas as pd
import time

# Row & Column Viewing Options
pd.set_option("max_rows", None)
pd.set_option("max_colwidth", None)

# Removes SSL Issues With Chrome
options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors-spki-list')
options.add_argument('log-level=3')
options.add_argument('--disable-notifications')

# Get URL
url = 'https://www.youtube.com/c/JohnWatsonRooney/videos?view=0&sort=dd&flow=grid'
driver = webdriver.Chrome(options=options)
driver.get(url)

# Auto Consent YouTube
consent_button_xpath = '//*[@id="yDmH0d"]/c-wiz/div/div/div/div[2]/div[1]/div[4]/form/div[1]/div/button/span'
consent = WebDriverWait(driver, 40).until(EC.element_to_be_clickable((By.XPATH, consent_button_xpath)))
consent = driver.find_element_by_xpath(consent_button_xpath)
consent.click()

# Get scroll height
current_scroll_height = driver.execute_script("return document.documentElement.scrollHeight")
print('> Current Scroll Height (before loop):', current_scroll_height)  # starts at 2264

# Locate individual YouTube videos
videos = driver.find_elements_by_class_name('style-scope ytd-grid-video-renderer')
video_list = []  # Create array for video list (video_item > video > videos)

# Loop: YouTube page load to end of scroll
while True:
    # Scroll to bottom of page
    driver.execute_script("window.scrollTo(0, arguments[0]);", current_scroll_height)
    # Wait to load page
    time.sleep(1)
    print(' - Current Scroll Height (while loop):', current_scroll_height)
    # Extract title, views & date posted from individual YouTube video
    for video in videos:
        title = video.find_element_by_xpath('.//*[@id="video-title"]').text
        views = video.find_element_by_xpath('.//*[@id="metadata-line"]/span[1]').text
        post_date = video.find_element_by_xpath('.//*[@id="metadata-line"]/span[2]').text
        # Store individual YouTube video information within dictionary
        vid_item = {
            'title': title,
            'views': views,
            'posted': post_date
        }
        # Store individual videos in main array video_list
        video_list.append(vid_item)
    # Use pandas to present main array containing individual videos + number of videos in array
    df = pd.DataFrame(video_list)
    # Calculate new scroll height and compare with current scroll height
    new_height = driver.execute_script("return document.documentElement.scrollHeight")
    print(' - New Scroll Height (after for loop):', new_height)
    if new_height == current_scroll_height:
        print(df)
        print('\nNumber of Videos (end of for loop):', len(video_list))
        break
    current_scroll_height = new_height

driver.quit()
I am trying to write a script to automate job applications on Linkedin using selenium and python.
The steps are simple:
Open the LinkedIn page, enter the ID and password, and log in.
Open https://linkedin.com/jobs, enter the search keyword and location, and click Search (directly opening links like https://www.linkedin.com/jobs/search/?geoId=101452733&keywords=python&location=Australia gets stuck loading, probably due to the lack of some POST information from the previous page).
The click opens the job search page, but this doesn't seem to update the driver, as it still searches on the previous page.
import selenium
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from bs4 import BeautifulSoup
import pandas as pd
import yaml

driver = webdriver.Chrome("/usr/lib/chromium-browser/chromedriver")
url = "https://linkedin.com/"
driver.get(url)
content = driver.page_source

stream = open("details.yaml", 'r')
details = yaml.safe_load(stream)

def login():
    username = driver.find_element_by_id("session_key")
    password = driver.find_element_by_id("session_password")
    username.send_keys(details["login_details"]["id"])
    password.send_keys(details["login_details"]["password"])
    driver.find_element_by_class_name("sign-in-form__submit-button").click()

def get_experience():
    return "1%C22"

login()

jobs_url = f'https://www.linkedin.com/jobs/'
driver.get(jobs_url)
keyword = driver.find_element_by_xpath("//input[starts-with(@id, 'jobs-search-box-keyword-id-ember')]")
location = driver.find_element_by_xpath("//input[starts-with(@id, 'jobs-search-box-location-id-ember')]")
keyword.send_keys("python")
location.send_keys("Australia")
driver.find_element_by_xpath("//button[normalize-space()='Search']").click()
WebDriverWait(driver, 10)
# content = driver.page_source
# soup = BeautifulSoup(content)
# with open("a.html", 'w') as a:
#     a.write(str(soup))
print(driver.current_url)
driver.current_url returns https://linkedin.com/jobs/ instead of https://www.linkedin.com/jobs/search/?geoId=101452733&keywords=python&location=Australia as it should. I have tried printing the content to a file; it is indeed from the previous jobs page and not from the search page. I have also tried searching for elements from the search page, like the experience filter and the Easy Apply button, but the search results in a not-found error.
I am not sure why this isn't working.
Any ideas? Thanks in advance.
UPDATE
It works if I try to directly open something like https://www.linkedin.com/jobs/search/?f_AL=True&f_E=2&keywords=python&location=Australia but not https://www.linkedin.com/jobs/search/?f_AL=True&f_E=1%2C2&keywords=python&location=Australia
The difference between these two links is that one of them takes only one value for the experience level while the other takes two values, so it's probably not an issue with POST values. (%2C is just the URL-encoded comma separating those two values, as the quick check below shows.)
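A quick sanity check with the standard library confirms that %2C is simply an encoded comma; nothing here is specific to LinkedIn.
from urllib.parse import quote, unquote

print(unquote("1%2C2"))       # -> 1,2  (two experience-level values)
print(quote("1,2", safe=""))  # -> 1%2C2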
You are getting and printing the current URL immediately after clicking on the search button, before the page changed with the response received from the server.
This is why it outputs you with https://linkedin.com/jobs/ instead of something like https://www.linkedin.com/jobs/search/?geoId=101452733&keywords=python&location=Australia.
WebDriverWait(driver, 10) or wait = WebDriverWait(driver, 20) will not cause any kind of delay the way time.sleep(10) does.
wait = WebDriverWait(driver, 20) only instantiates a wait object, an instance of the WebDriverWait class; on its own it waits for nothing. To actually wait, you have to call its until() method with an expected condition.
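A minimal sketch of how that could look in this case: EC.url_contains is one condition that fits here, and the "/jobs/search" substring is an assumption based on the URL the asker expects to land on.
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# ... after clicking the Search button ...
wait = WebDriverWait(driver, 20)
# Block until the browser has actually navigated to the search results page
# (raises TimeoutException after 20 seconds if it never does)
wait.until(EC.url_contains("/jobs/search"))
print(driver.current_url)  # now reflects the search results URL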
I want to scrape, e.g., the titles of the first 200 questions under the web page https://www.quora.com/topic/Stack-Overflow-4/all_questions. And I tried the following code:
import requests
from bs4 import BeautifulSoup
url = "https://www.quora.com/topic/Stack-Overflow-4/all_questions"
print("url")
print(url)
r = requests.get(url) # HTTP request
print("r")
print(r)
html_doc = r.text # Extracts the html
print("html_doc")
print(html_doc)
soup = BeautifulSoup(html_doc, 'lxml') # Create a BeautifulSoup object
print("soup")
print(soup)
It gave me the text at https://pastebin.com/9dSPzAyX. If we search for href='/, we can see that the HTML does contain the titles of some questions. However, the problem is that the number is not enough; on the actual web page, a user needs to scroll down manually to trigger extra loading.
Does anyone know how I could mimic "scrolling down" programmatically to load more of the page's content?
Infinite scrolling on a web page is based on JavaScript functionality. Therefore, to find out which URL we need to access and which parameters to use, we need to either thoroughly study the JS code running inside the page or, preferably, examine the requests the browser makes when you scroll down the page. We can study the requests using the browser's Developer Tools.
See this example for Quora:
The more you scroll down, the more requests are generated. Your requests should then go to that URL instead of the normal page URL, but keep in mind that you need to send the correct headers and payload.
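As a rough illustration only, replaying such a captured request with the requests library might look like the sketch below. The URL, headers and payload are placeholders, not Quora's real endpoint or parameters; copy the actual values from the request you observe in the Network tab.
import requests

# Placeholder values -- copy the real ones from the Network tab after scrolling
captured_url = "https://www.quora.com/<endpoint-observed-in-devtools>"
headers = {
    "User-Agent": "Mozilla/5.0",
    "Content-Type": "application/json",
    # plus any cookies / tokens the captured request sends
}
payload = {"page": 2}  # whatever pagination parameters the captured request uses

response = requests.post(captured_url, headers=headers, json=payload)
response.raise_for_status()
print(response.status_code, len(response.text))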
Another, easier solution is to use Selenium.
I couldn't find a way to do this with requests, but you can use Selenium. The script below first prints the number of questions on the initial load, then sends the End key to mimic scrolling down. You can see that the number of questions went from 20 to 40 after sending the End key.
I wait 5 seconds before reading the DOM again, in case the script runs faster than the new content loads. You can improve this by using expected conditions (EC) with Selenium.
The page loads 20 questions per scroll. So if you are looking to scrape 100 questions, then you need to send the End key 5 times.
To use the code below you need to install chromedriver.
http://chromedriver.chromium.org/downloads
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
import time

CHROMEDRIVER_PATH = ""
CHROME_PATH = ""
WINDOW_SIZE = "1920,1080"

chrome_options = Options()
# chrome_options.add_argument("--headless")
chrome_options.add_argument("--window-size=%s" % WINDOW_SIZE)
chrome_options.binary_location = CHROME_PATH
prefs = {'profile.managed_default_content_settings.images': 2}
chrome_options.add_experimental_option("prefs", prefs)

url = "https://www.quora.com/topic/Stack-Overflow-4/all_questions"

def scrape(url, times):
    if not url.startswith('http'):
        raise Exception('URLs need to start with "http"')

    driver = webdriver.Chrome(
        executable_path=CHROMEDRIVER_PATH,
        chrome_options=chrome_options
    )
    driver.get(url)

    counter = 1
    while counter <= times:
        q_list = driver.find_element_by_class_name('TopicAllQuestionsList')
        questions = [x for x in q_list.find_elements_by_xpath('//div[@class="pagedlist_item"]')]
        q_len = len(questions)
        print(q_len)

        html = driver.find_element_by_tag_name('html')
        html.send_keys(Keys.END)
        wait = WebDriverWait(driver, 5)
        time.sleep(5)

        questions2 = [x for x in q_list.find_elements_by_xpath('//div[@class="pagedlist_item"]')]
        print(len(questions2))
        counter += 1
    driver.close()

if __name__ == '__main__':
    scrape(url, 5)
I recommend using Selenium rather than BeautifulSoup alone.
Selenium can both control the browser and parse the page: scroll down, click buttons, and so on.
This example scrolls down to collect all the users who liked a post on Instagram:
https://stackoverflow.com/a/54882356/5611675
If the content only loads on "scrolling down", this probably means that the page uses JavaScript to load the content dynamically.
You can try using a headless web client such as PhantomJS to load the page and execute the JavaScript in it, and simulate the scroll by injecting some JS such as document.body.scrollTop = sY; (Simulate scroll event using JavaScript).
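Since PhantomJS is no longer maintained, the same JS-injection idea can also be sketched with Selenium's execute_script. This is an illustrative sketch rather than the original answer's approach; the number of scroll steps and the wait time are arbitrary.
from selenium import webdriver
import time

driver = webdriver.Chrome()  # assumes chromedriver is on the PATH
driver.get("https://www.quora.com/topic/Stack-Overflow-4/all_questions")

# Inject a scroll to the bottom a few times, waiting for new content between steps
for _ in range(5):
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(2)

html = driver.page_source  # now contains the additionally loaded questions
driver.quit()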
Can anyone tell me how to access the underlying URL to view a given user's Instagram followers? I am able to do this with the Instagram API, but given the pending changes to the approval process, I have decided to switch to scraping.
The Instagram web browser allows you to view the follower list for any given public user - for example, to view Instagram's followers, visit "https://www.instagram.com/instagram", and then click on the followers URL to open a window that paginates through viewers (note: you must be logged in to your account to view this).
I note that the URL changes to "https://www.instagram.com/instagram/followers" when this window pops up, but I can't seem to view the underlying page source for this URL.
Since it appears on my browser window, I assume that I will be able to scrape. But do I have to use a package like Selenium? Does anyone know what the underlying URL is, so I don't have to use Selenium?
As an example, I am able to directly access the underlying feed data by visiting "instagram.com/instagram/media/", from which I can scrape and paginate through all iterations. I would like to do something similar with the list of followers, and access this data directly (rather than using Selenium).
EDIT: Dec 2018 Update:
Things have changed in Insta land since this was posted. Here is an updated script that is a bit more pythonic and better utilizes XPATH/CSS paths.
Note that to use this updated script, you must install the explicit package (pip install explicit), or convert each line with waiter to a pure selenium explicit wait.
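For reference, if you would rather not install the explicit package, a call such as waiter.find_element(driver, xpath, by=XPATH) can be swapped for a plain Selenium explicit wait roughly like this (a sketch of the equivalence, with an assumed 30-second timeout):
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def find_element(driver, xpath, timeout=30):
    # Wait until the element is present in the DOM, then return it
    return WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.XPATH, xpath)))

# waiter.find_write(driver, xpath, text, by=XPATH) then becomes roughly:
# find_element(driver, xpath).send_keys(text)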
import itertools

from explicit import waiter, XPATH
from selenium import webdriver


def login(driver):
    username = ""  # <username here>
    password = ""  # <password here>

    # Load page
    driver.get("https://www.instagram.com/accounts/login/")

    # Login
    waiter.find_write(driver, "//div/input[@name='username']", username, by=XPATH)
    waiter.find_write(driver, "//div/input[@name='password']", password, by=XPATH)
    waiter.find_element(driver, "//div/button[@type='submit']", by=XPATH).click()

    # Wait for the user dashboard page to load
    waiter.find_element(driver, "//a/span[@aria-label='Find People']", by=XPATH)


def scrape_followers(driver, account):
    # Load account page
    driver.get("https://www.instagram.com/{0}/".format(account))

    # Click the 'Follower(s)' link
    # driver.find_element_by_partial_link_text("follower").click()
    waiter.find_element(driver, "//a[@href='/instagram/followers/']", by=XPATH).click()

    # Wait for the followers modal to load
    waiter.find_element(driver, "//div[@role='dialog']", by=XPATH)

    # At this point a Followers modal pops open. If you immediately scroll to the bottom,
    # you hit a stopping point and a "See All Suggestions" link. If you fiddle with the
    # modal by scrolling up and down, you can force it to load additional followers for
    # that person.

    # Now the modal will begin loading followers every time you scroll to the bottom.
    # Keep scrolling in a loop until you've hit the desired number of followers.
    # In this instance, I'm using a generator to return followers one-by-one
    follower_css = "ul div li:nth-child({}) a.notranslate"  # Taking advantage of CSS's nth-child functionality
    for group in itertools.count(start=1, step=12):
        for follower_index in range(group, group + 12):
            yield waiter.find_element(driver, follower_css.format(follower_index)).text

        # Instagram loads followers 12 at a time. Find the last follower element
        # and scroll it into view, forcing instagram to load another 12
        # Even though we just found this elem in the previous for loop, there can
        # potentially be a large amount of time between that call and this one,
        # and the element might have gone stale. Let's just re-acquire it to avoid
        # that
        last_follower = waiter.find_element(driver, follower_css.format(follower_index))
        driver.execute_script("arguments[0].scrollIntoView();", last_follower)


if __name__ == "__main__":
    account = 'instagram'
    driver = webdriver.Chrome()
    try:
        login(driver)

        # Print the first 75 followers for the "instagram" account
        print('Followers of the "{}" account'.format(account))
        for count, follower in enumerate(scrape_followers(driver, account=account), 1):
            print("\t{:>3}: {}".format(count, follower))

            if count >= 75:
                break
    finally:
        driver.quit()
I did a quick benchmark to show how rapidly performance degrades as the number of followers you attempt to scrape this way grows:
$ python example.py
Followers of the "instagram" account
Found 100 followers in 11 seconds
Found 200 followers in 19 seconds
Found 300 followers in 29 seconds
Found 400 followers in 47 seconds
Found 500 followers in 71 seconds
Found 600 followers in 106 seconds
Found 700 followers in 157 seconds
Found 800 followers in 213 seconds
Found 900 followers in 284 seconds
Found 1000 followers in 375 seconds
Original post:
Your question is a little confusing. For instance, I'm not really sure what "from which I can scrape and paginate through all iterations" actually means. What are you currently using to scrape and paginate?
Regardless, instagram.com/instagram/media/ is not the same type of endpoint as instagram.com/instagram/followers. The media endpoint appears to be a REST API, configured to return an easily parseable JSON object.
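For illustration, fetching that media endpoint as the asker describes might have looked roughly like this at the time; this is a sketch based on the question's claim that the endpoint returned feed data directly, and the "items" key is an assumption about the JSON shape rather than a documented field.
import requests

# Sketch only: relies on the endpoint behaviour described in the question,
# which Instagram has since removed.
resp = requests.get("https://www.instagram.com/instagram/media/")
resp.raise_for_status()
data = resp.json()

# "items" is an assumed key -- inspect the real response for the actual structure
for item in data.get("items", []):
    print(item.get("link"))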
The followers endpoint isn't really a RESTful endpoint from what I can tell. Rather, Instagram AJAXs in the information to the page source (using React?) after you click the Followers button. I don't think you will be able to get that information without using something like Selenium, which can load/render the javascript that displays the followers to the user.
This example code will work:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


def login(driver):
    username = ""  # <username here>
    password = ""  # <password here>

    # Load page
    driver.get("https://www.instagram.com/accounts/login/")

    # Login
    driver.find_element_by_xpath("//div/input[@name='username']").send_keys(username)
    driver.find_element_by_xpath("//div/input[@name='password']").send_keys(password)
    driver.find_element_by_xpath("//span/button").click()

    # Wait for the login page to load
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.LINK_TEXT, "See All")))


def scrape_followers(driver, account):
    # Load account page
    driver.get("https://www.instagram.com/{0}/".format(account))

    # Click the 'Follower(s)' link
    driver.find_element_by_partial_link_text("follower").click()

    # Wait for the followers modal to load
    xpath = "//div[@style='position: relative; z-index: 1;']/div/div[2]/div/div[1]"
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.XPATH, xpath)))

    # You'll need to figure out some scrolling magic here. Something that can
    # scroll to the bottom of the followers modal, and know when it's reached
    # the bottom. This is pretty impractical for people with a lot of followers

    # Finally, scrape the followers
    xpath = "//div[@style='position: relative; z-index: 1;']//ul/li/div/div/div/div/a"
    followers_elems = driver.find_elements_by_xpath(xpath)

    return [e.text for e in followers_elems]


if __name__ == "__main__":
    driver = webdriver.Chrome()
    try:
        login(driver)
        followers = scrape_followers(driver, "instagram")
        print(followers)
    finally:
        driver.quit()
This approach is problematic for a number of reasons, chief among them being how slow it is relative to the API.
Update: March 2020
This is just Levi's answer with small updates in some parts, because as it was, it didn't quit the driver successfully. This also gets all the followers by default; as everyone else has said, it's not intended for accounts with a lot of followers.
import itertools

from explicit import waiter, XPATH
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from time import sleep


def login(driver):
    username = ""  # <username here>
    password = ""  # <password here>

    # Load page
    driver.get("https://www.instagram.com/accounts/login/")
    sleep(3)

    # Login
    driver.find_element_by_name("username").send_keys(username)
    driver.find_element_by_name("password").send_keys(password)
    submit = driver.find_element_by_tag_name('form')
    submit.submit()

    # Wait for the user dashboard page to load
    WebDriverWait(driver, 15).until(
        EC.presence_of_element_located((By.LINK_TEXT, "See All")))


def scrape_followers(driver, account):
    # Load account page
    driver.get("https://www.instagram.com/{0}/".format(account))

    # Click the 'Follower(s)' link
    # driver.find_element_by_partial_link_text("follower").click()
    sleep(2)
    driver.find_element_by_partial_link_text("follower").click()

    # Wait for the followers modal to load
    waiter.find_element(driver, "//div[@role='dialog']", by=XPATH)
    allfoll = int(driver.find_element_by_xpath("//li[2]/a/span").text)

    # At this point a Followers modal pops open. If you immediately scroll to the bottom,
    # you hit a stopping point and a "See All Suggestions" link. If you fiddle with the
    # modal by scrolling up and down, you can force it to load additional followers for
    # that person.

    # Now the modal will begin loading followers every time you scroll to the bottom.
    # Keep scrolling in a loop until you've hit the desired number of followers.
    # In this instance, I'm using a generator to return followers one-by-one
    follower_css = "ul div li:nth-child({}) a.notranslate"  # Taking advantage of CSS's nth-child functionality
    for group in itertools.count(start=1, step=12):
        for follower_index in range(group, group + 12):
            if follower_index > allfoll:
                # End the generator once every follower has been yielded
                # (raising StopIteration explicitly is an error in Python 3.7+)
                return
            yield waiter.find_element(driver, follower_css.format(follower_index)).text

        # Instagram loads followers 12 at a time. Find the last follower element
        # and scroll it into view, forcing instagram to load another 12
        # Even though we just found this elem in the previous for loop, there can
        # potentially be a large amount of time between that call and this one,
        # and the element might have gone stale. Let's just re-acquire it to avoid
        # that
        last_follower = waiter.find_element(driver, follower_css.format(group + 11))
        driver.execute_script("arguments[0].scrollIntoView();", last_follower)


if __name__ == "__main__":
    account = ""  # <account to check>
    driver = webdriver.Firefox(executable_path="./geckodriver")
    try:
        login(driver)
        print('Followers of the "{}" account'.format(account))
        for count, follower in enumerate(scrape_followers(driver, account=account), 1):
            print("\t{:>3}: {}".format(count, follower))
    finally:
        driver.quit()
I noticed that the previous answer no longer works, so I made an updated version based on the previous answer, which includes the scrolling feature (to get all the users in the list, not just those loaded initially). In addition, this scrapes both followers and following. (You'll need to download chromedriver as well)
import time
from selenium import webdriver as wd
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# The account you want to check
account = ""

# Chrome executable
chrome_binary = r"chrome.exe"  # Add your path here


def login(driver):
    username = ""  # Your username
    password = ""  # Your password

    # Load page
    driver.get("https://www.instagram.com/accounts/login/")

    # Login
    driver.find_element_by_xpath("//div/input[@name='username']").send_keys(username)
    driver.find_element_by_xpath("//div/input[@name='password']").send_keys(password)
    driver.find_element_by_xpath("//span/button").click()

    # Wait for the login page to load
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.LINK_TEXT, "See All")))


def scrape_followers(driver, account):
    # Load account page
    driver.get("https://www.instagram.com/{0}/".format(account))

    # Click the 'Follower(s)' link
    driver.find_element_by_partial_link_text("follower").click()

    # Wait for the followers modal to load
    xpath = "/html/body/div[4]/div/div/div[2]/div/div[2]"
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.XPATH, xpath)))

    SCROLL_PAUSE = 0.5  # Pause to allow loading of content
    driver.execute_script("followersbox = document.getElementsByClassName('_gs38e')[0];")
    last_height = driver.execute_script("return followersbox.scrollHeight;")

    # We need to scroll the followers modal to ensure that all followers are loaded
    while True:
        driver.execute_script("followersbox.scrollTo(0, followersbox.scrollHeight);")

        # Wait for page to load
        time.sleep(SCROLL_PAUSE)

        # Calculate new scrollHeight and compare with the previous
        new_height = driver.execute_script("return followersbox.scrollHeight;")
        if new_height == last_height:
            break
        last_height = new_height

    # Finally, scrape the followers
    xpath = "/html/body/div[4]/div/div/div[2]/div/div[2]/ul/li"
    followers_elems = driver.find_elements_by_xpath(xpath)

    followers_temp = [e.text for e in followers_elems]  # List of followers (username, full name, follow text)
    followers = []  # List of followers (usernames only)

    # Go through each entry in the list, append the username to the followers list
    for i in followers_temp:
        username, sep, name = i.partition('\n')
        followers.append(username)

    print("______________________________________")
    print("FOLLOWERS")

    return followers


def scrape_following(driver, account):
    # Load account page
    driver.get("https://www.instagram.com/{0}/".format(account))

    # Click the 'Following' link
    driver.find_element_by_partial_link_text("following").click()

    # Wait for the following modal to load
    xpath = "/html/body/div[4]/div/div/div[2]/div/div[2]"
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.XPATH, xpath)))

    SCROLL_PAUSE = 0.5  # Pause to allow loading of content
    driver.execute_script("followingbox = document.getElementsByClassName('_gs38e')[0];")
    last_height = driver.execute_script("return followingbox.scrollHeight;")

    # We need to scroll the following modal to ensure that all following are loaded
    while True:
        driver.execute_script("followingbox.scrollTo(0, followingbox.scrollHeight);")

        # Wait for page to load
        time.sleep(SCROLL_PAUSE)

        # Calculate new scrollHeight and compare with the previous
        new_height = driver.execute_script("return followingbox.scrollHeight;")
        if new_height == last_height:
            break
        last_height = new_height

    # Finally, scrape the following
    xpath = "/html/body/div[4]/div/div/div[2]/div/div[2]/ul/li"
    following_elems = driver.find_elements_by_xpath(xpath)

    following_temp = [e.text for e in following_elems]  # List of following (username, full name, follow text)
    following = []  # List of following (usernames only)

    # Go through each entry in the list, append the username to the following list
    for i in following_temp:
        username, sep, name = i.partition('\n')
        following.append(username)

    print("\n______________________________________")
    print("FOLLOWING")

    return following


if __name__ == "__main__":
    options = wd.ChromeOptions()
    options.binary_location = chrome_binary  # chrome.exe
    driver_binary = r"chromedriver.exe"
    driver = wd.Chrome(driver_binary, chrome_options=options)
    try:
        login(driver)
        followers = scrape_followers(driver, account)
        print(followers)
        following = scrape_following(driver, account)
        print(following)
    finally:
        driver.quit()