Instagram Scraper with Selenium - Python

I keep getting this message
"Message: no such element: Unable to locate element: {"method":"css selector","selector":".MGdpg > button:nth-child(1)"}"
while scraping a post with 500 comments on Instagram, and I end up getting only 10 of them. Is there any problem with the code?
from selenium import webdriver
import time
import sys

driver = webdriver.Chrome()
driver.get(sys.argv[1])
time.sleep(3)

# if the user is not logged in, dismiss the login dialog
try:
    close_button = driver.find_element_by_class_name('xqRnw')
    close_button.click()
except:
    pass

try:
    load_more_comment = driver.find_element_by_css_selector('.MGdpg > button:nth-child(1)')
    print("Found {}".format(str(load_more_comment)))
    i = 0
    while load_more_comment.is_displayed() and i < int(sys.argv[2]):
        load_more_comment.click()
        time.sleep(2.5)
        load_more_comment = driver.find_element_by_css_selector('.MGdpg > button:nth-child(1)')
        print("Found {}".format(str(load_more_comment)))
        i += 1
except Exception as e:
    print(e)

user_names = []
user_comments = []
comment = driver.find_elements_by_class_name('gElp9')
for c in comment:
    container = c.find_element_by_class_name('C4VMK')
    name = container.find_element_by_class_name('_6lAjh').text
    content = container.find_element_by_tag_name('span').text
    content = content.replace('\n', ' ').strip()
    user_names.append(name)
    user_comments.append(content)

user_names.pop(0)
user_comments.pop(0)

import excel_exporter
excel_exporter.export(user_names, user_comments)

driver.close()
By the way, the code belongs to Agi Maulana; you can check his GitHub repo for it:
https://github.com/AgiMaulana/Instagram-Comments-Scraper
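
A likely cause: Instagram re-renders the "load more comments" button after every click, so the fixed .MGdpg selector eventually stops matching and the loop gives up after a few rounds. As a hedged sketch (these auto-generated class names may well have changed since), you could wait for the button to become clickable again on every pass instead of failing on the first miss:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

def load_all_comments(driver, max_clicks):
    # Selector carried over from the question; Instagram's generated
    # class names change frequently, so treat it as an assumption.
    selector = '.MGdpg > button:nth-child(1)'
    wait = WebDriverWait(driver, 10)
    for _ in range(max_clicks):
        try:
            # Re-locate the button on every iteration instead of
            # clicking a reference that may have gone stale.
            button = wait.until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, selector)))
            button.click()
        except TimeoutException:
            break  # button gone: no more comments to load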

"Stale element reference: element is not attached to the page document" when trying to scrape information from different links

I am using Selenium with Python and trying to move the cursor to and click on a specific element. This works for the first link, and the HTML structure is the same for the next link, but I get a StaleElementReferenceException for the second link when accessing it through the same webdriver. Why does this happen and how do I fix it? Below is the code I am running. Thank you so much!
def getZest(url):
    zestlist = []
    yearlist = []
    driver.get(url)
    time.sleep(5)

    result = False
    attempts = 0
    while attempts < 5:
        try:
            Home_Value = wait.until(EC.presence_of_element_located((By.XPATH, "//a[text()='Home value']")))
            action.move_to_element(Home_Value).click().perform()

            zestimate = driver.find_element_by_xpath('//*[@id="ds-home-values"]/div/div[3]/button')
            action.move_to_element(zestimate).perform()
            result = True
            break
        except exceptions.StaleElementReferenceException as e:
            print(e)
        attempts = attempts + 1

fivenums = ["https://www.zillow.com/homedetails/212-Haddrell-St-Mount-Pleasant-SC-29464/10922911_zpid/", "https://www.zillow.com/homedetails/20-Grove-St-Hicksville-NY-11801/31127407_zpid/"]
for num in fivenums:
    getZest(num)
I was able to get information about the first and second links with the following code, without any error:
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
import selenium.webdriver.support.expected_conditions as EC
from selenium.webdriver.common.by import By
import time
from selenium.common import exceptions

driver = webdriver.Chrome(executable_path=r"chromedriver.exe")
driver.maximize_window()

def getZest(url):
    zestlist = []
    yearlist = []
    driver.get(url)
    time.sleep(5)
    result = False
    attempts = 0
    # create the ActionChains and the wait after the page load,
    # so they never refer to a document that no longer exists
    action = webdriver.ActionChains(driver)
    wait = WebDriverWait(driver, 300)
    while attempts < 5:
        try:
            Home_Value = wait.until(EC.presence_of_element_located((By.XPATH, "//a[text()='Home value']")))
            action.move_to_element(Home_Value).click().perform()
            zestimate = driver.find_element_by_xpath('//*[@id="ds-home-values"]/div/div[3]/button')
            action.move_to_element(zestimate).perform()
            result = True
            break
        except exceptions.StaleElementReferenceException as e:
            print(e)
        attempts = attempts + 1

fivenums = ["https://www.zillow.com/homedetails/212-Haddrell-St-Mount-Pleasant-SC-29464/10922911_zpid/", "https://www.zillow.com/homedetails/20-Grove-St-Hicksville-NY-11801/31127407_zpid/"]
for num in fivenums:
    getZest(num)
Your code doesn't show where some of the variables (such as action and wait) are instantiated, so maybe that is where the problem is located.
However, when opening the first link, the website showed me the Google CAPTCHA protection, so I suppose you have some kind of authorization and the owner's permission to scrape this information.

Getting the YouTube channel name into a comment

I have been wanting to get this Selenium script to comment on videos that contain a specific keyword.
But I want to make it say the channel name too; can someone please help me with that if possible? Thanks.
(I know the {}s should not be in there, but they show where the channel name would go.)
Here is the code.
import time
import os
import numpy as np
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException

def youtube_login(email, password):
    op = webdriver.ChromeOptions()
    op.binary_location = os.environ.get("GOOGLE_CHROME_BIN")
    #op.add_argument('--headless')
    op.add_argument('--disable-dev-shm-usage')
    op.add_argument('--no-sandbox')
    driver = webdriver.Chrome(options=op)
    driver.get('https://accounts.google.com/ServiceLogin?hl=en&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fhl%3Den%26feature%3Dsign_in_button%26app%3Ddesktop%26action_handle_signin%3Dtrue%26next%3D%252F&uilel=3&passive=true&service=youtube#identifier')
    driver.find_element_by_id('identifierId').send_keys(email)
    driver.find_element_by_id('identifierNext').click()
    time.sleep(3)
    #WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, 'div#password input[name="password"]')))
    driver.find_element_by_css_selector('div#password input[name="password"]').send_keys(password)
    time.sleep(4)
    driver.find_element_by_id('passwordNext').click()
    return driver

def comment_page(driver, urls, comment):
    if len(urls) == 0:
        print('Youtube Comment Bot: Finished!')
        return []
    url = urls.pop()
    driver.get(url)
    print(url)
    driver.implicitly_wait(1)
    if not check_exists_by_xpath(driver, '//*[@id="movie_player"]'):
        return comment_page(driver, urls, random_comment())
    time.sleep(4)
    driver.execute_script("window.scrollTo(0, 600);")
    if not check_exists_by_xpath(driver, '//*[@id="simple-box"]/ytd-comment-simplebox-renderer'):
        return comment_page(driver, urls, random_comment())
    if check_exists_by_xpath(driver, '//*[@id="contents"]/ytd-message-renderer'):
        return comment_page(driver, urls, random_comment())
    WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.CSS_SELECTOR, "ytd-comments ytd-comment-simplebox-renderer")))
    driver.find_element_by_css_selector("ytd-comments ytd-comment-simplebox-renderer div#placeholder-area").click()
    driver.implicitly_wait(5)
    driver.find_element_by_xpath('//*[@id="contenteditable-root"]').send_keys(comment)
    driver.find_element_by_xpath('//*[@id="contenteditable-root"]').send_keys(Keys.CONTROL, Keys.ENTER)
    post = WebDriverWait(driver, 15).until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, 'ytd-comments ytd-comment-simplebox-renderer'))
    )
    post.click()
    r = np.random.randint(2, 5)
    time.sleep(r)
    return comment_page(driver, urls, random_comment())

def random_comment():
    # You can edit these lines=======
    messages = [
        'sup {channel name here}, I loved this video lol cant wait to see more :D'
    ]
    # ===============================
    r = np.random.randint(0, len(messages))
    return messages[r]

def check_exists_by_xpath(driver, xpath):
    try:
        driver.find_element_by_xpath(xpath)
    except NoSuchElementException:
        return False
    return True

if __name__ == '__main__':
    # You should edit these lines=======
    email = 'Youremail@gmail.com'
    password = 'Yourpassword'
    # ==================================
    urls = [
        'https://www.youtube.com/watch?v=Szww2_VqEKs&t',
    ]
    inp = open("url.txt", "r")
    for line in inp.readlines():
        urls.append(line.strip())
    driver = youtube_login(email, password)
    comment_page(driver, urls, random_comment())
Here is a locator that gets the channel name:
//div[contains(@class, 'channel-name')]//a
All you have to do now is get its text and substitute it into your string.
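For example, a hedged sketch (assuming the locator above still matches YouTube's markup) that reads the channel name and substitutes it into the comment template:

def comment_with_channel_name(driver):
    # Locator taken from the answer above; YouTube's markup may change.
    channel_name = driver.find_element_by_xpath(
        "//div[contains(@class, 'channel-name')]//a").text
    # A real format placeholder replaces the {channel name here} stub
    template = 'sup {}, I loved this video lol cant wait to see more :D'
    return template.format(channel_name)

You could then pass the returned string to comment_page in place of random_comment().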

I'm making a bot that likes every post that isn't liked yet

The problem is that it doesn't like the posts.
I have tried different methods, such as locating by tag name.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time

def like_photo(self):
    driver = self.driver
    driver.get("https://www.instagram.com")
    time.sleep(1)
    for i in range(1, 4):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)
    # find all the heart links
    hrefs = driver.find_elements_by_xpath("//span[@aria-label='Synes godt om']")
    pic_hrefs = [elem.get_attribute('href') for elem in hrefs]
    pic_hrefs = [href for href in pic_hrefs]
    print(' Photos ' + str(len(pic_hrefs)))
    for _ in pic_hrefs:
        driver.get("https://www.instagram.com")
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        try:
            like_button = lambda: driver.find_elements_by_xpath("//span[@aria-label='Synes godt om']")
            like_button.click()
            time.sleep(18)
        except Exception as e:
            time.sleep(1)

nameIG = InstagramBot(username, password)
nameIG.login()
nameIG.like_photo()
It doesn't like any post; the output is just:
Photos 4
Process finished with exit code 0
Exit code 0 means your code ran without raising an error, but there's still a problem: like_button is a lambda (and find_elements returns a list), so like_button.click() raises an AttributeError that your bare except silently swallows.
To see the actual errors in your code, change the exception handler:
except Exception as e:
    print(e)  # shows the actual error
Then try this:
like_buttons = driver.find_elements_by_xpath(some_xpath_to_buttons)  # list of WebElements
for button in like_buttons:
    button.click()
    time.sleep(18)
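Putting both fixes together, a hedged sketch that reuses the question's own XPath (the Danish 'Synes godt om' like label, which Instagram may have changed since):

def like_photo(self):
    driver = self.driver
    driver.get("https://www.instagram.com")
    time.sleep(2)
    # find_elements returns a plain list of WebElements
    like_buttons = driver.find_elements_by_xpath(
        "//span[@aria-label='Synes godt om']")
    print(' Photos ' + str(len(like_buttons)))
    for button in like_buttons:
        try:
            button.click()
            time.sleep(18)
        except Exception as e:
            print(e)  # surface the real error instead of hiding it
            time.sleep(1)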

Python Selenium breaks off when accessing all 'a' tags

I can access all the 'a' tags, i.e. all the 'house detail' links, on "https://www.zillow.com/homes/recently_sold/Culver-City-CA/house,condo,apartment_duplex,townhouse_type/51617_rid/12m_days/globalrelevanceex_sort/34.044908,-118.348417,33.961088,-118.468924_rect/12_zm/" using Python + Selenium, but the script breaks off when I go into each page to crawl its information, and that's what troubles me. I want the code to crawl all the information on the 26 pages successfully. Thank you!
# coding: utf-8
from selenium import webdriver
import time

driver = webdriver.Chrome()
driver.maximize_window()

def crawlHouseDetailForInvoke():
    try:
        driver.find_element_by_class_name("collapsible-header").click()  # price/tax history
        time.sleep(5)
        table = driver.find_element_by_xpath('//div[@id = "wrapper"]//div[@id = "detail-container-column"]//section[3]/div[@id = "tax-price-history"]/div[@id = "hdp-price-history"]/div/table')
        print(table.text)
    except Exception:
        print("Failed to read the data!")

def crawlRegion(url):
    driver.get(url)
    page_links = driver.find_elements_by_xpath('//div[@id = "search-pagination-wrapper"]/ol/li/a')
    print("Number of pages on the site: %d" % len(page_links))
    house_link_parent = driver.find_element_by_id('list-results')
    house_links = house_link_parent.find_elements_by_xpath('//div[@id = "search-results"]/ul/li/article/div/a')
    print("Listings per page: %d" % len(house_links))
    times = 0
    for j in range(len(house_links)):
        times = times + 1
        if times % 9 == 0:
            print("Elements reloaded...")
            house_link_parent = driver.find_element_by_id('list-results')
            house_links = house_link_parent.find_elements_by_xpath('//div[@id = "search-results"]/ul/li/article/div/a')
        print("Index: %d" % j)
        print("Link: %s" % house_links[j].get_attribute("href"))
        house_links[j].click()
        time.sleep(8)
        crawlHouseDetailForInvoke()
        driver.back()

if __name__ == "__main__":
    regionUrl = "https://www.zillow.com/homes/recently_sold/Culver-City-CA/house,condo,apartment_duplex,townhouse_type/51617_rid/12m_days/globalrelevanceex_sort/34.05529,-118.33211,33.956531,-118.485919_rect/12_zm/"
    print("crawler is started...")
    crawlRegion(regionUrl)
    driver.close()
    driver.quit()
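A common cause of this kind of break-off: house_links goes stale once click() plus driver.back() re-render the results list, so house_links[j] raises on a later pass. A hedged sketch of the usual workaround, collecting the hrefs up front and navigating with driver.get() so no element reference has to survive a page change (crawlRegionByHref is a made-up name, and the XPath is taken from the question):

def crawlRegionByHref(url):
    # Variant of crawlRegion above; untested against Zillow's current markup.
    driver.get(url)
    links = driver.find_elements_by_xpath('//div[@id = "search-results"]/ul/li/article/div/a')
    # Plain string hrefs cannot go stale, unlike WebElement references.
    hrefs = [a.get_attribute("href") for a in links]
    for href in hrefs:
        driver.get(href)  # navigate directly instead of click() + back()
        time.sleep(8)
        crawlHouseDetailForInvoke()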

Stale exception when web scraping with Selenium and Python

I am trying to grab information from TripAdvisor. I sometimes get
Message: stale element reference: element is not attached to the page document
(Session info: chrome=47.0.2526.73)
(Driver info: chromedriver=2.20.353124 (035346203162d32c80f1dce587c8154a1efa0c3b),platform=Mac OS X 10.10.4 x86_64)
and then the element is just whatever I last assigned it to. How can I fix my code to handle this issue and recover, instead of re-running the whole script?
import time
from lxml import etree

def getElements(driver):
    elements = []
    for dd in driver.find_elements_by_xpath("//*[contains(@class, 'ui_button original')]"):
        try:
            if dd.text == "Book Now":
                elements.append(dd)
        except Exception as ee:
            print(ee)
    return elements

def getBookingPartner(driver, ibInfo):
    data = []
    i = 0
    elements = []
    time.sleep(2)
    elements = getElements(driver)
    elementCounter = 0
    # retry up to 5 times until the buttons are found
    while elements == [] and elementCounter < 5:
        elements = getElements(driver)
        elementCounter += 1
    print("Length of elements should be > 0 : " + str(len(elements)))
    for ii in ibInfo:
        if ii[0] == "Yes":
            driver.implicitly_wait(3)
            bookingPartner = "Error"
            print(ii)
            driver.implicitly_wait(3)
            try:
                elements[i].click()
                driver.implicitly_wait(3)
                driver.switch_to_window(driver.window_handles[-1])
            except Exception as ee:
                try:
                    driver.refresh()
                    getElements(driver)[i].click()
                    time.sleep(1)
                    driver.switch_to_window(driver.window_handles[-1])
                except Exception as ee:
                    print("Stale Exception....")
                    print(ee)
            try:
                driver.implicitly_wait(3)
                driver.find_elements_by_xpath("//*[contains(@class, 'book_now')]")[1].click()
                driver.implicitly_wait(1)
                page = etree.HTML(driver.page_source)
                bookingPartner = page.xpath("//div[contains(@class, 'custServiceMsg')]//text()")[0].split("will")[0].strip()
            except:
                try:
                    time.sleep(3)
                    driver.find_elements_by_xpath("//*[contains(@class, 'book_now')]")[1].click()
                    time.sleep(2)
                    page = etree.HTML(driver.page_source)
                    bookingPartner = page.xpath("//div[contains(@class, 'custServiceMsg')]//text()")[0].split("will")[0].strip()
                except:
                    try:
                        bookingPartner = page.xpath("//div[contains(@class, 'custServiceMsg')]//text()")[1].split("will")[0].strip()
                    except Exception as ee:
                        bookingPartner = "Error"
                        print("error")
            i += 1
            if bookingPartner == "The remainder":
                bookingPartner = page.xpath("//div[contains(@class, 'custServiceMsg')]//text()")[1].split("will")[0].strip()
            if len(driver.window_handles) > 1:
                driver.close()
                driver.switch_to_window(driver.window_handles[0])
            print(bookingPartner)
            data.append([ii[0], ii[1], bookingPartner])
        else:
            data.append([ii[0], ii[1], "N/A"])
            ii.extend(["N/A"])
    print(data)
    return data
A Stale Element Reference Exception occurs when an element:
Has been deleted
Is no longer attached to the DOM (as in your case)
Has changed
From the docs:
You should discard the current reference you hold and replace it, possibly by locating the element again once it is attached to the DOM.
i.e.: "Find" the element again.
You'll need to modify the code to catch this error at the appropriate step.
from selenium.common.exceptions import StaleElementReferenceException

elem = driver.find_element_by_xpath('something leaves dom')
# ... do other actions which change the page, and then later...
try:
    elem.click()
except StaleElementReferenceException:
    elem = driver.find_element_by_xpath('something leaves dom')
    elem.click()
Make a re-usable version if you need it extensively for several elements.
By the way, you should not be catching bare Exception in your code; be specific about which exceptions you want to handle.
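A hedged sketch of such a re-usable version (retry_on_stale is a made-up name for illustration): it re-runs the locate step and retries the action whenever the held reference has gone stale.

from selenium.common.exceptions import StaleElementReferenceException

def retry_on_stale(driver, locate, action, retries=3):
    # Hypothetical helper, not part of Selenium itself: re-locate
    # the element and retry the action on each stale reference.
    for attempt in range(retries):
        try:
            return action(locate(driver))
        except StaleElementReferenceException:
            if attempt == retries - 1:
                raise

# Usage:
# retry_on_stale(driver,
#                lambda d: d.find_element_by_xpath('something leaves dom'),
#                lambda el: el.click())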
