Selenium WebDriver could not get some of the content - Python

https://www.forrent.com/apartment-community-profile/1012635
I am trying to parse a web page such as this one. Selenium returns some of the content of this page, but not all of it. For example, "Professionally Managed by: B & A Associates" is in the web page, but it's not returned by the variable 'content' in the script. Any idea why that is and how to solve it?
driver = webdriver.Firefox(executable_path='/home/yliu/repos/funnel_objects/listing_sites/geckodriver')
try:
    driver.set_page_load_timeout(20)
    driver.get(url)
    # WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.ID, "contactHeading")))
    WebDriverWait(driver, 40)
    html = driver.page_source
    content = BeautifulSoup(html, "lxml")
    driver.quit()
    return content
except TimeoutException:
    print('time out from contact')
    return None

That content is a lazy-load component. It is displayed only once you have scrolled down to it, so you need a script that scrolls down to the bottom of the page. See the code below.
driver = webdriver.Firefox(executable_path='/home/yliu/repos/funnel_objects/listing_sites/geckodriver')
try:
    driver.set_page_load_timeout(20)
    driver.get(url)
    # WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.ID, "contactHeading")))
    # WebDriverWait(driver, 40)
    SCROLL_PAUSE_TIME = 0.5
    SCROLL_LENGTH = 200
    page_height = int(driver.execute_script("return document.body.scrollHeight"))
    scrollPosition = 0
    while scrollPosition < page_height:
        scrollPosition = scrollPosition + SCROLL_LENGTH
        driver.execute_script("window.scrollTo(0, " + str(scrollPosition) + ");")
        time.sleep(SCROLL_PAUSE_TIME)
    html = driver.page_source
    content = BeautifulSoup(html, "lxml")
    driver.quit()
    return content
except TimeoutException:
    print('time out from contact')
    return None
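One side note: a bare WebDriverWait(driver, 40) waits for nothing on its own; it only constructs the wait object, and blocking happens only when you call .until() on it. If the fixed scroll pauses ever prove flaky, pairing the scroll with an explicit wait for the lazy-loaded element is an option. A minimal sketch, assuming the management block carries the contactHeading ID that the commented-out line in the question targets (an assumption about this page's markup):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# After scrolling to the bottom, block until the lazy-loaded element is
# actually in the DOM instead of sleeping a fixed amount of time.
WebDriverWait(driver, 20).until(
    EC.presence_of_element_located((By.ID, "contactHeading"))
)
html = driver.page_source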


AttributeError: 'NoneType' object has no attribute 'suppress' using selenium webdriver

I am trying to scrape some links from https://www.mckinsey.com/capabilities/operations/our-insights using Selenium with Python.
from selenium.webdriver.common.by import By
from selenium import webdriver
from bs4 import BeautifulSoup
import time

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--disable-notifications")
# browser = webdriver.Chrome('C:\\chromedriver.exe', options=chrome_options)
browser = webdriver.Firefox()

url = "https://www.mckinsey.com/capabilities/operations/our-insights"
browser.get(url)
time.sleep(5)

try:
    accept = browser.find_element(By.XPATH, '//*[@id="onetrust-accept-btn-handler"]')
    accept.click()
    time.sleep(2)
    browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
except:
    pass

n = 1
while n < 3:
    try:
        browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        button = browser.find_element(By.XPATH, '//*[@id="skipToMain"]/div[2]/section[11]/div[2]/a')
        button.click()
        time.sleep(2)
        browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        print('page', n)
        n = n + 1
    except:
        print('page ended at', n)
        break

source = browser.execute_script("return document.body.innerHTML")
time.sleep(5)
soup = BeautifulSoup(source, 'lxml')
Running the above code gives the following error:
Exception ignored in: <function Service.__del__ at 0x000002AE1979DAF0>
Traceback (most recent call last):
File "C:\ProgramData\Anaconda3\lib\site-packages\selenium\webdriver\common\service.py", line 177, in __del__
AttributeError: 'NoneType' object has no attribute 'suppress'
I tried both Chrome and Firefox. Both give the same error.
Python version - 3.9
You can try the following example combining Selenium with bs4:
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()  # driver setup was not shown in the original answer

url = 'https://www.mckinsey.com/capabilities/operations/our-insights'
driver.get(url)
driver.maximize_window()
time.sleep(3)

accept = driver.find_element(By.XPATH, '//*[@id="onetrust-accept-btn-handler"]')
accept.click()
time.sleep(2)

data = []
for x in range(3):
    try:
        soup = BeautifulSoup(driver.page_source, 'lxml')
        links = soup.select('[class="block-list text-s"]>div')
        print(len(links))
        for x in links:
            link = x.a
            link = 'https://www.mckinsey.com' + link.get('href') if link else None
            data.append(link)
        loadMoreButton = driver.find_element(By.XPATH, "//a[contains(text(),'View more')]")
        if loadMoreButton:
            driver.execute_script("arguments[0].click();", loadMoreButton)
            # loadMoreButton.click()
            time.sleep(3)
    except Exception as e:
        print(e)
        break

print(set(data))
# df = pd.DataFrame(set(data))
# print(df)
Output:
{'https://www.mckinsey.com/capabilities/operations/our-insights/global-infrastructure-initiative/voices/voices-introduction-october-2022',
 'https://www.mckinsey.com/capabilities/people-and-organizational-performance/our-insights/leading-operating-model-modernization-what-do-transformation-leaders-say',
 'https://www.mckinsey.com/capabilities/operations/our-insights/sustainable-spaces-countering-climate-risk-in-capital-projects',
 'https://www.mckinsey.com/capabilities/operations/our-insights/harnessing-volatility-technology-transformation-in-oil-and-gas',
 'https://www.mckinsey.com/capabilities/operations/our-insights/building-supply-chain-resilience',
 'https://www.mckinsey.com/capabilities/operations/our-insights/industrial-resource-productivity-and-the-road-to-sustainability',
 'https://www.mckinsey.com/capabilities/operations/our-insights/outsprinting-the-energy-crisis',
 'https://www.mckinsey.com/capabilities/operations/our-insights/emerging-from-disruption-the-future-of-pharma-operations-strategy',
 'https://www.mckinsey.com/industries/public-and-social-sector/our-insights/using-advanced-analytics-to-improve-performance-in-customs-agencies',
 'https://www.mckinsey.com/industries/life-sciences/our-insights/against-the-odds-how-life-sciences-companies-excel-in-large-transformations',
 'https://www.mckinsey.com/capabilities/operations/our-insights/the-hidden-value-of-voice-conversations-part-1-trends-and-technologies',
 'https://www.mckinsey.com/featured-insights/mckinsey-on-books/the-titanium-economy',
 'https://www.mckinsey.com/capabilities/operations/our-insights/smart-scheduling-how-to-solve-workforce-planning-challenges-with-ai',
 'https://www.mckinsey.com/capabilities/operations/our-insights/generative-scheduling-saving-time-and-money-in-capital-projects',
 'https://www.mckinsey.com/capabilities/operations/our-insights/global-infrastructure-initiative/voices/on-the-path-to-net-zero-steel-in-building-and-construction',
 'https://www.mckinsey.com/capabilities/operations/our-insights/global-infrastructure-initiative/voices/disrupting-transport-an-interview-with-robert-falck-of-einride',
 'https://www.mckinsey.com/capabilities/operations/our-insights/the-industrial-revolution-in-services',
 'https://www.mckinsey.com/capabilities/operations/our-insights/how-mining-companies-reach-the-operational-excellence-gold-standard',
 'https://www.mckinsey.com/capabilities/operations/our-insights/a-more-resilient-supply-chain-from-optimized-operations-planning',
 'https://www.mckinsey.com/capabilities/operations/our-insights/full-potential-procurement-lessons-amid-inflation-and-volatility',
 'https://www.mckinsey.com/capabilities/operations/our-insights/taking-the-pulse-of-shifting-supply-chains',
 'https://www.mckinsey.com/capabilities/operations/our-insights/global-infrastructure-initiative/voices/news-from-the-global-infrastructure-initiative-august-2022',
 'https://www.mckinsey.com/capabilities/operations/our-insights/delivering-the-us-manufacturing-renaissance',
 'https://www.mckinsey.com/capabilities/operations/our-insights/building-sustainability-into-operations',
 'https://www.mckinsey.com/capabilities/operations/our-insights/global-infrastructure-initiative/voices/news-from-the-global-infrastructure-initiative-october-2022',
 'https://www.mckinsey.com/capabilities/operations/our-insights/global-infrastructure-initiative/voices/dhl-on-sustainable-customer-centric-delivery-in-the-last-mile',
 'https://www.mckinsey.com/industries/advanced-electronics/our-insights/sustainability-in-packaging-five-key-levers-for-significant-impact',
 'https://www.mckinsey.com/capabilities/operations/our-insights/global-infrastructure-initiative/voices/doing-good-demands-doing-better-delivering-net-zero-capital-projects',
 'https://www.mckinsey.com/industries/life-sciences/our-insights/reimagining-the-future-of-biopharma-manufacturing',
 'https://www.mckinsey.com/capabilities/operations/our-insights/coca-cola-the-people-first-story-of-a-digital-transformation',
 'https://www.mckinsey.com/capabilities/operations/our-insights/is-your-manufacturing-network-an-anchor-or-a-sail',
 'https://www.mckinsey.com/industries/semiconductors/our-insights/rapid-throughput-improvement-at-mature-semiconductor-fabs',
 'https://www.mckinsey.com/capabilities/operations/our-insights/global-infrastructure-initiative/voices/reducing-embodied-carbon-in-new-construction',
 'https://www.mckinsey.com/industries/healthcare-systems-and-services/our-insights/optimizing-health-system-supply-chain-performance',
 'https://www.mckinsey.com/capabilities/operations/our-insights/global-infrastructure-initiative/voices/managing-capital-risk-in-the-race-to-net-zero',
 'https://www.mckinsey.com/capabilities/operations/our-insights/accelerating-green-growth-in-the-built-environment',
 'https://www.mckinsey.com/capabilities/transformation/our-insights/you-cant-move-too-fast-a-conversation-with-andy-penn',
 'https://www.mckinsey.com/capabilities/operations/our-insights/inflation-fighter-and-value-creator-procurements-best-kept-secret',
 'https://www.mckinsey.com/capabilities/operations/our-insights/global-infrastructure-initiative/voices/preparing-for-tomorrow-an-interview-with-tariq-taherbhai',
 'https://www.mckinsey.com/capabilities/operations/our-insights/value-speed-and-scale-a-new-era-for-operations-in-asia',
 'https://www.mckinsey.com/capabilities/operations/our-insights/stepping-up-what-coos-will-need-to-succeed-in-2023-and-beyond',
 None,
 'https://www.mckinsey.com/capabilities/operations/our-insights/digital-twins-what-could-they-do-for-your-business',
 'https://www.mckinsey.com/capabilities/operations/our-insights/global-infrastructure-initiative/voices/voices-introduction-august-2022',
 'https://www.mckinsey.com/capabilities/operations/our-insights/how-good-are-your-internal-operations-really',
 'https://www.mckinsey.com/capabilities/operations/our-insights/114-down-10-million-to-go-the-global-lighthouse-networks-mission',
 'https://www.mckinsey.com/capabilities/operations/our-insights/people-and-places-how-and-where-to-work-next',
 'https://www.mckinsey.com/capabilities/operations/our-insights/accelerating-capital-projects-to-secure-advantages-in-the-net-zero-transition',
 'https://www.mckinsey.com/capabilities/operations/our-insights/global-infrastructure-initiative/voices/unlocking-hydrogens-power-for-long-haul-freight-transport',
 'https://www.mckinsey.com/industries/engineering-construction-and-building-materials/our-insights/how-much-is-a-brick-that-depends',
 'https://www.mckinsey.com/capabilities/operations/our-insights/global-infrastructure-initiative/voices/mapping-the-way-decarbonizing-roads',
 'https://www.mckinsey.com/capabilities/operations/our-insights/global-infrastructure-initiative/voices/investing-in-pathways-to-decarbonize-infrastructure',
 'https://www.mckinsey.com/capabilities/operations/our-insights/the-hidden-value-of-voice-conversations-part-2-reaping-the-rewards',
 'https://www.mckinsey.com/capabilities/operations/our-insights/power-spike-how-battery-makers-can-respond-to-surging-demand-from-evs',
 'https://www.mckinsey.com/capabilities/operations/our-insights/the-care-of-one-hyperpersonalization-of-customer-care',
 'https://www.mckinsey.com/capabilities/operations/our-insights/the-scaling-imperative-for-industry-4-point-0',
 'https://www.mckinsey.com/capabilities/operations/our-insights/utility-procurement-ready-to-meet-new-market-challenges',
 'https://www.mckinsey.com/capabilities/operations/our-insights/global-infrastructure-initiative/voices/the-art-of-the-possible-an-interview-with-leaders-from-scottish-water',
 'https://www.mckinsey.com/capabilities/operations/our-insights/converge-it-and-ot-to-turbocharge-business-operations-scaling-power'}
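As for the AttributeError itself: "Exception ignored in: ... Service.__del__" means the error is raised during interpreter shutdown, after module globals have already been torn down, so it is noisy but does not affect the scrape. The usual remedies are quitting the driver explicitly and upgrading Selenium. A minimal sketch of the explicit cleanup:

from selenium import webdriver

driver = webdriver.Chrome()
try:
    driver.get("https://www.mckinsey.com/capabilities/operations/our-insights")
    # ... scraping logic ...
finally:
    # Quitting explicitly stops the driver service before interpreter
    # shutdown, which usually avoids the AttributeError in Service.__del__.
    driver.quit()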

How to scrape all the comments of a youtube video using selenium, python

I want to scrape all the comments of a YouTube video using Selenium, but I am able to scrape only the first 20. I don't see what's wrong with the following code -
# imports required
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import time

# initialisation
driver = webdriver.Chrome()
url = 'https://www.youtube.com/watch?v=etzmAZ7oiz0'
driver.get(url)
time.sleep(3)
final_comment_list = []
author_list = []
comment_list = []

# while loop for scrolling down the page
last_height = driver.execute_script("return document.body.scrollHeight")
html = driver.find_element(By.TAG_NAME, 'html')
while True:
    print("Scroll down to bottom")
    # Scroll down to bottom
    html.send_keys(Keys.PAGE_DOWN)
    # Wait to load the page
    time.sleep(5)
    # find author name and author comment
    try:
        authors_list_el = driver.find_elements(By.CSS_SELECTOR,
            '#author-text.yt-simple-endpoint.style-scope.ytd-comment-renderer span.style-scope.ytd-comment-renderer')
        author_list = [x.text for x in authors_list_el]
    except:
        print(f"not able to find author for {url} video")
    try:
        comments = driver.find_elements(By.CSS_SELECTOR, '#content.style-scope.ytd-expander')
        comment_list = [x.text for x in comments]
    except:
        print(f"not able to find comments for {url} video")
    # creating dictionary object and adding to list
    obj1 = dict(author_list=author_list, comment_list=comment_list)
    final_comment_list.append(obj1)
    # Calculate new scroll height and compare with last scroll height
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    else:
        last_height = new_height

# printing the result
print(final_comment_list)
print(len(author_list))
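A likely culprit, and what the last answer on this page does differently: on YouTube's app layout, document.body.scrollHeight does not grow as comments load, so the exit condition trips after the first batch of ~20. Scrolling and measuring document.documentElement instead is a minimal sketch of the fix (the batching behavior is an assumption about YouTube's current markup):

import time

last_height = driver.execute_script("return document.documentElement.scrollHeight")
while True:
    # Scroll the document element, not the body; YouTube loads further
    # comment batches as this height grows.
    driver.execute_script("window.scrollTo(0, document.documentElement.scrollHeight);")
    time.sleep(2)
    new_height = driver.execute_script("return document.documentElement.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height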

Selenium webdriver: How to delete/flag spam comments on YouTube platform (without API)

I've been trying to flag/report a list of spam comments in a particular YouTube video.
For that I've been using this code in Python, which loads my existing profile so that I am logged in with my account:
URL = "https://www.youtube.com/watch?
v=dvecqwfU6xw&lc=Ugxw_nsUNUor9AUEBGp4AaABAg.9fDfvkgiqtW9fDkE2r6Blm"
soup = BeautifulSoup(requests.get(URL).content, "html.parser")
options = webdriver.ChromeOptions()
user = pathlib.Path().home()
print(user)
options.add_argument(f"user-data-dir={user}/AppData/Local/Google/Chrome/User Data/")
driver = webdriver.Chrome('chromedriver.exe', chrome_options=options)
driver.get(URL)
wait = WebDriverWait(driver, 100)
comment_box = '//*[@id="comment"]'
reply_box = '//*[@id="replies"]'
while True:
    driver.execute_script("window.scrollBy(0, 200);")
    try:
        reply_box = driver.find_element(By.XPATH, reply_box)
        print(reply_box.text)
        break
    except:
        pass
# resp = driver.request('POST', 'https://www.youtube.com/youtubei/v1/flag/get_form?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8&prettyPrint=false')
# print(resp.text)
button = wait.until(EC.presence_of_element_located((By.XPATH, '//*[@id="button"]')))
driver.execute_script("arguments[0].click();", button)
The problem comes when opening the menu: since you have to hover over the three-dots menu before it becomes clickable, I never manage to open the actual menu to report/flag the comment.
My mistake was not taking the full XPath. It works perfectly like this, thanks:
options = webdriver.ChromeOptions()
user = pathlib.Path().home()
print(user)
options.add_argument(f"user-data-dir={user}/AppData/Local/Google/Chrome/User Data/")
options.add_argument('--headless')
driver = webdriver.Chrome('chromedriver.exe', chrome_options=options)
driver.get(URL)
wait = WebDriverWait(driver, 100)
comment_box = '//*[@id="comment"]'
reply_box = '//*[@id="replies"]'
while True:
    driver.execute_script("window.scrollBy(0, 200);")
    try:
        reply_box = driver.find_element(By.XPATH, reply_box)
        print(reply_box.text)
        break
    except:
        pass

option_button = '/html/body/ytd-app/div[1]/ytd-page-manager/ytd-watch-flexy/div[5]/div[1]/div/div[2]/ytd-comments/ytd-item-section-renderer/div[3]/ytd-comment-thread-renderer[1]/div/ytd-comment-replies-renderer/div[2]/ytd-comment-renderer/div[3]/div[3]/ytd-menu-renderer/yt-icon-button/button'
option_button = wait.until(EC.presence_of_element_located((By.XPATH, option_button)))
driver.execute_script("arguments[0].click();", option_button)

report_button = '/html/body/ytd-app/ytd-popup-container/tp-yt-iron-dropdown/div/ytd-menu-popup-renderer/tp-yt-paper-listbox/ytd-menu-service-item-renderer/tp-yt-paper-item/yt-formatted-string'
report_button = wait.until(EC.presence_of_element_located((By.XPATH, report_button)))
driver.execute_script("arguments[0].click();", report_button)

report_button_spam = '/html/body/ytd-app/ytd-popup-container/tp-yt-paper-dialog/yt-report-form-modal-renderer/tp-yt-paper-dialog-scrollable/div/div/yt-options-renderer/div/tp-yt-paper-radio-group/tp-yt-paper-radio-button[1]/div[1]'
report_button_spam = wait.until(EC.presence_of_element_located((By.XPATH, report_button_spam)))
driver.execute_script("arguments[0].click();", report_button_spam)

report_button_send = '/html/body/ytd-app/ytd-popup-container/tp-yt-paper-dialog/yt-report-form-modal-renderer/div/yt-button-renderer[2]/a/tp-yt-paper-button'
report_button_send = wait.until(EC.presence_of_element_located((By.XPATH, report_button_send)))
driver.execute_script("arguments[0].click();", report_button_send)

popup_button_done = '/html/body/ytd-app/ytd-popup-container/tp-yt-paper-dialog[2]/yt-confirm-dialog-renderer/div[2]/div[2]/yt-button-renderer[3]/a/tp-yt-paper-button'
popup_button_done = wait.until(EC.presence_of_element_located((By.XPATH, popup_button_done)))
print(popup_button_done.text)
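Note that absolute XPaths like these break whenever YouTube shuffles its DOM. A more resilient variant is to hover the comment with ActionChains (as the coolmod answer below does with move_to_element) so the three-dots menu becomes clickable, then target the button relatively. A sketch; the relative selector is derived from the tail of the absolute XPath above (ytd-menu-renderer/yt-icon-button/button) and is still an assumption about YouTube's current markup:

from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By

# Hover the comment first; YouTube only reveals the three-dots menu on hover.
comment = driver.find_element(By.XPATH, '//ytd-comment-replies-renderer//ytd-comment-renderer')
ActionChains(driver).move_to_element(comment).perform()

# Relative selector matching the tail of the absolute XPath above.
menu_button = comment.find_element(By.CSS_SELECTOR, 'ytd-menu-renderer yt-icon-button button')
driver.execute_script("arguments[0].click();", menu_button)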

Scraping with selenium and BeautifulSoup doesn't return all the items on the page

So I came from the question here.
Now I am able to interact with the page: scroll down, close the popup that appears, and click at the bottom to expand the page.
The problem is that when I count the items, the code returns only 20 when it should be 40.
I have checked the code again and again; I'm missing something, but I don't know what.
See my code below:
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import time
import datetime

options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--incognito')
# options.add_argument('--headless')
driver = webdriver.Chrome(executable_path=r"C:\\chromedriver.exe", options=options)

url = 'https://www.coolmod.com/componentes-pc-procesadores?f=375::No'
driver.get(url)

iter = 1
while True:
    scrollHeight = driver.execute_script("return document.documentElement.scrollHeight")
    Height = 10 * iter
    driver.execute_script("window.scrollTo(0, " + str(Height) + ");")
    if Height > scrollHeight:
        print('End of page')
        break
    iter += 1

time.sleep(3)
popup = driver.find_element_by_class_name('confirm').click()
time.sleep(3)
ver_mas = driver.find_elements_by_class_name('button-load-more')
for x in range(len(ver_mas)):
    if ver_mas[x].is_displayed():
        driver.execute_script("arguments[0].click();", ver_mas[x])
        time.sleep(10)

page_source = driver.page_source
soup = BeautifulSoup(page_source, 'lxml')
# print(soup)
items = soup.find_all('div', class_='col-xs-12 col-sm-6 col-sm-6 col-md-6 col-lg-3 col-product col-custom-width')
print(len(items))
What is wrong? I'm a newbie in the scraping world.
Regards
Your while and for statements don't work as intended.
- Using while True: is bad practice here.
- You scroll to the bottom, but the button-load-more button isn't displayed there, so Selenium will not find it as displayed.
- find_elements_by_class_name looks for multiple elements, but the page has only one element with that class.
- if ver_mas[x].is_displayed(): will, at best, be executed only once, because the range is 1.
Below you can find the solution: the code looks for the button, moves to it instead of scrolling, and performs a click. If it fails to find the button, meaning all the items have been loaded, it breaks the while and moves on.
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import NoSuchElementException

url = 'https://www.coolmod.com/componentes-pc-procesadores?f=375::No'
driver.get(url)
time.sleep(3)
popup = driver.find_element_by_class_name('confirm').click()

iter = 1
while iter > 0:
    time.sleep(3)
    try:
        ver_mas = driver.find_element_by_class_name('button-load-more')
        actions = ActionChains(driver)
        actions.move_to_element(ver_mas).perform()
        driver.execute_script("arguments[0].click();", ver_mas)
    except NoSuchElementException:
        break
    iter += 1

page_source = driver.page_source
soup = BeautifulSoup(page_source, 'lxml')
# print(soup)
items = soup.find_all('div', class_='col-xs-12 col-sm-6 col-sm-6 col-md-6 col-lg-3 col-product col-custom-width')
print(len(items))

How to scrape the youtube comments with selenium in python?

I am trying to scrape YouTube comments so that each row contains the title of the video, the author of the comment, and the comment itself. As seen in the code below, I open the driver successfully, get rid of some authentication and cookie messages, and scroll enough to get the first comments loaded. After this happens, I am still not able to get the comment text by XPath, as seen below.
csv_file = open('funda_youtube_comments.csv', 'w', encoding="UTF-8", newline="")
writer = csv.writer(csv_file)
writer.writerow(['title', 'comment', 'author'])

PATH = r"C:\Users\veiza\OneDrive\Desktop\AUAS\University\Quarter 2\Online Data Mining\Project1test\chromedriver.exe"
driver = webdriver.Chrome(PATH)
driver.implicitly_wait(10)
driver.get("https://www.youtube.com/watch?v=VWQaP9txG6M&t=76s")
driver.maximize_window()
time.sleep(2)

driver.execute_script('window.scrollTo(0,700);')
wait = WebDriverWait(driver, 20)
wait.until(EC.presence_of_element_located((By.XPATH, "//div[@id='dismiss-button']"))).click()
time.sleep(2)
WebDriverWait(driver, 10).until(EC.frame_to_be_available_and_switch_to_it((By.CSS_SELECTOR, "iframe[src^='https://consent.google.com']")))
WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, "//div[@id='introAgreeButton']"))).click()
time.sleep(2)

title = driver.title
print(title)
time.sleep(5)

totalcomments = len(driver.find_elements_by_xpath('//*[@id="content-text"]'))
if totalcomments < 50:
    index = totalcomments
else:
    index = 50

youtube_dict = {}
ccount = 0
while ccount < index:
    try:
        comment = driver.find_elements_by_xpath('//*[@id="content-text"]')[ccount].text
    except:
        comment = ""
    try:
        authors = driver.find_elements_by_xpath('//a[@id="author-text"]/span')[ccount].text
    except:
        authors = ""
    try:
        title = title
    except:
        title = ""
    youtube_dict['comment'] = comment
    youtube_dict['author'] = authors
    youtube_dict['video title'] = title
    writer.writerow(youtube_dict.values())
    ccount = ccount + 1
    print(youtube_dict)

driver.close()
What am I doing wrong?
If you want to make it simple, you can use tube_dl
pip install tube_dl
This module has Comments class that can help you with processing comments.
Here's the simple usage of that:
from tube_dl.comments import Comments
comments = Comments('yt url').process_comments()
#If you want limited comments, you can specify that. Ex : process_comments(count=45)
Feel free to raise issues at github.com/shekharchander/tube_dl. I'll be happy to resolve issues.
I was able to scrape the YouTube comments. Below you can see the solution.
options = Options()
options.add_argument("--headless")
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
PATH = r"C:\Users\veiza\OneDrive\Desktop\AUAS\University\Quarter 2\Online Data " \
       r"Mining\Project1test\chromedriver.exe "
driver = webdriver.Chrome(executable_path=PATH, options=options)
driver.get(response.url)
time.sleep(5)

try:
    title = driver.find_element_by_xpath('//*[@id="container"]/h1/yt-formatted-string').text
    comment_section = driver.find_element_by_xpath('//*[@id="comments"]')
except exceptions.NoSuchElementException:
    error = "Error: Double check selector OR "
    error += "element may not yet be on the screen at the time of the find operation"
    print(error)

driver.execute_script("arguments[0].scrollIntoView();", comment_section)
time.sleep(7)

last_height = driver.execute_script("return document.documentElement.scrollHeight")
while True:
    driver.execute_script("window.scrollTo(0, document.documentElement.scrollHeight);")
    time.sleep(2)
    new_height = driver.execute_script("return document.documentElement.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height
driver.execute_script("window.scrollTo(0, document.documentElement.scrollHeight);")

try:
    accounts_elems = driver.find_elements_by_xpath('//*[@id="author-text"]')
    comment_elems = driver.find_elements_by_xpath('//*[@id="content-text"]')
except exceptions.NoSuchElementException:
    error = "Error: Double check selector OR "
    error += "element may not yet be on the screen at the time of the find operation"
    print(error)

accounts = [elem.text for elem in accounts_elems]
comments = [elem.text for elem in comment_elems]

for comment_index in range(len(comment_elems)):
    # note: yield is only valid inside a function, so this loop presumably
    # lives in a generator (e.g. a Scrapy-style parse method)
    yield {
        'title': title,
        'url': driver.current_url,
        'account': accounts[comment_index],
        'comment': comments[comment_index]
    }
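Since yield is only legal inside a function, the snippet above needs a generator wrapper to run standalone. A minimal sketch; the function name, the trimmed-down body, and the CSV usage are illustrative, not from the original answer:

import csv

def scrape_comments(driver):
    """Generator wrapping the answer's scraping logic (sketch)."""
    # ... setup and scrolling from the answer above go here ...
    accounts_elems = driver.find_elements_by_xpath('//*[@id="author-text"]')
    comment_elems = driver.find_elements_by_xpath('//*[@id="content-text"]')
    accounts = [elem.text for elem in accounts_elems]
    comments = [elem.text for elem in comment_elems]
    for i in range(len(comment_elems)):
        yield {'url': driver.current_url, 'account': accounts[i], 'comment': comments[i]}

# hypothetical usage: stream the rows into a CSV like the question does
with open('youtube_comments.csv', 'w', encoding='UTF-8', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=['url', 'account', 'comment'])
    writer.writeheader()
    for row in scrape_comments(driver):
        writer.writerow(row)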
