LinkedIn profile info scraping using Selenium - Python

I am trying to scrape profiles from LinkedIn. I get profile URLs from the code below and want to pass them to driver.get(URL). However, the scraped URLs come back in the wrong format (e.g. wrapped in [ ] brackets), and I get this error:
selenium.common.exceptions.InvalidArgumentException: Message: invalid
argument: 'url' must be a string
Could you please suggest how to get the URLs in linklist = [ ] into the proper format so I can pass them to driver.get(URL)? Thanks!
from time import sleep
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from tqdm import tqdm

options = Options()
options.add_argument("--start-maximized")
options.headless = True
url = "https://www.linkedin.com/login?fromSignIn=true&trk=guest_homepage-basic_nav-header-signin"
driver = webdriver.Chrome(path, options=options)
driver.get(url)
driver.find_element_by_id('username').send_keys('name')
driver.find_element_by_id('password').send_keys('password', Keys.ENTER)
driver.implicitly_wait(10)
driver.find_element_by_class_name('search-global-typeahead__input').send_keys('Marketing manager', Keys.ENTER)
driver.implicitly_wait(10)
driver.find_element_by_xpath('//button[text()="People"]').click()
x = 0
profile = []
linklist = []
condition = True
while condition:
    sleep(2)
    driver.execute_script("window.scrollTo(0, 1400);")
    driver.implicitly_wait(10)
    linkedin_members = driver.find_elements_by_xpath('//span[@class="entity-result__title"]')
    links = [linkedin_member.find_element_by_xpath('.//a[@class="app-aware-link"]').get_attribute('href') for linkedin_member in linkedin_members if "/in/" in linkedin_member.find_element_by_xpath('.//a[@class="app-aware-link"]').get_attribute('href')]
    x = x + 1
    linklist.append(link for link in links)
    driver.implicitly_wait(10)
    driver.find_element_by_xpath("""//button[@class='artdeco-pagination__button artdeco-pagination__button--next artdeco-button artdeco-button--muted artdeco-button--icon-right artdeco-button--1 artdeco-button--tertiary ember-view' and contains(.,'Next')]""").click()
    if x == 2:
        condition = False
profile = []
for l in tqdm(linklist):
    driver.get(l)

I used a for loop instead of your while loop, because there is no real loop condition: you only want to run the loop twice.
Here's how you can do it:
linklist = []
for i in range(2):
    time.sleep(2)
    driver.execute_script("window.scrollTo(0, 1400);")
    driver.implicitly_wait(10)
    linkedin_members = driver.find_elements_by_xpath('//span[@class="entity-result__title"]')
    link = driver.find_element_by_class_name('app-aware-link').get_attribute('href')
    linklist.append(link)
    driver.implicitly_wait(10)
    driver.find_element_by_xpath("""//button[@class='artdeco-pagination__button artdeco-pagination__button--next artdeco-button artdeco-button--muted artdeco-button--icon-right artdeco-button--1 artdeco-button--tertiary ember-view' and contains(.,'Next')]""").click()
for url in linklist:
    driver.get(url)
I searched for the class that contains the profile URL and used .get_attribute('href') to extract it.
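For completeness: the "[ ] brackets" / "'url' must be a string" symptom in the question most likely comes from linklist.append(link for link in links), which stores a generator object in the list instead of the individual URL strings. A minimal sketch of that one-line fix, reusing the links list already built inside the question's loop:
links = [
    member.find_element_by_xpath('.//a[@class="app-aware-link"]').get_attribute('href')
    for member in linkedin_members
]
# extend() flattens the page's links into linklist as plain strings;
# append(link for link in links) would add a single generator object instead.
linklist.extend(href for href in links if "/in/" in href)

# Every entry is now a string, so this no longer raises InvalidArgumentException:
for url in linklist:
    driver.get(url)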

Related

Getting text from multiple webpages (pagination) in Selenium Python

I want to extract text from multiple pages. Currently I am able to extract data from the first page, but I want to follow the pagination, whose page count is dynamic, and append the data from all the remaining pages as well. I have written this simple code, which extracts data from the first page only.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

element_list = []
opts = webdriver.ChromeOptions()
opts.headless = True
driver = webdriver.Chrome(ChromeDriverManager().install())
base_url = "XYZ"
driver.maximize_window()
driver.get(base_url)
driver.set_page_load_timeout(50)
element = WebDriverWait(driver, 50).until(EC.presence_of_element_located((By.ID, 'all-my-groups')))
l = []
l = driver.find_elements_by_xpath("//div[contains(@class, 'alias-wrapper sim-ellipsis sim-list--shortId')]")
for i in l:
    print(i.text)
I have shared images of the pagination class in case that helps.
If we could automate this and extract from all the pages, that would be awesome. Also, I am new to this, so please pardon me for asking silly questions. Thanks in advance.
You have provided the code just for the previous-page button. I guess you need to keep going to the next page for as long as one exists. As I don't know what site we are talking about, I can only guess its behavior, so I'm assuming the 'Next' button disappears when no next page exists. If so, it can be done like this:
element_list = []
opts = webdriver.ChromeOptions()
opts.headless = True
driver = webdriver.Chrome(ChromeDriverManager().install())
base_url = "XYZ"
driver.maximize_window()
driver.get(base_url)
driver.set_page_load_timeout(50)
element = WebDriverWait(driver, 50).until(EC.presence_of_element_located((By.ID, 'all-my-groups')))
l = []
l = driver.find_elements_by_xpath("//div[contains(@class, 'alias-wrapper sim-ellipsis sim-list--shortId')]")
while True:
    try:
        next_page = driver.find_element(By.XPATH, '//button[@label="Next page"]')
    except NoSuchElementException:
        break
    next_page.click()
    l.extend(driver.find_elements(By.XPATH, "//div[contains(@class, 'alias-wrapper sim-ellipsis sim-list--shortId')]"))
for i in l:
    print(i.text)
To be able to catch the exception, this import has to be added:
from selenium.common.exceptions import NoSuchElementException
Also note that the method find_elements_by_xpath is deprecated, so it would be better to replace this line:
l = driver.find_elements_by_xpath("//div[contains(@class, 'alias-wrapper sim-ellipsis sim-list--shortId')]")
with this one:
l = driver.find_elements(By.XPATH, "//div[contains(@class, 'alias-wrapper sim-ellipsis sim-list--shortId')]")
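Putting it all together, here is a sketch of the whole loop in the non-deprecated Selenium 4 API (same guessed locators as above; collecting the .text values right away is a small change that also avoids StaleElementReferenceException once the page content changes):
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException

driver = webdriver.Chrome()
driver.get("XYZ")  # placeholder base_url from the question
WebDriverWait(driver, 50).until(EC.presence_of_element_located((By.ID, 'all-my-groups')))

# Grab the text immediately so nothing goes stale after pagination.
row_xpath = "//div[contains(@class, 'alias-wrapper sim-ellipsis sim-list--shortId')]"
texts = [el.text for el in driver.find_elements(By.XPATH, row_xpath)]
while True:
    try:
        next_page = driver.find_element(By.XPATH, '//button[@label="Next page"]')
    except NoSuchElementException:
        break  # no 'Next page' button left: we are on the last page
    next_page.click()
    texts.extend(el.text for el in driver.find_elements(By.XPATH, row_xpath))

for text in texts:
    print(text)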

Selenium webdriver: How to delete/flag spam comments on YouTube platform (without API)

I've been trying to flag/report a list of spam comments on a particular YouTube video.
For that I've been using this Python code, which loads my existing Chrome profile so that I'm logged in with my account:
import pathlib
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

URL = "https://www.youtube.com/watch?v=dvecqwfU6xw&lc=Ugxw_nsUNUor9AUEBGp4AaABAg.9fDfvkgiqtW9fDkE2r6Blm"
soup = BeautifulSoup(requests.get(URL).content, "html.parser")
options = webdriver.ChromeOptions()
user = pathlib.Path().home()
print(user)
options.add_argument(f"user-data-dir={user}/AppData/Local/Google/Chrome/User Data/")
driver = webdriver.Chrome('chromedriver.exe', chrome_options=options)
driver.get(URL)
wait = WebDriverWait(driver, 100)
comment_box = '//*[@id="comment"]'
reply_box = '//*[@id="replies"]'
while True:
    driver.execute_script("window.scrollBy(0, 200);")
    try:
        reply_box = driver.find_element(By.XPATH, reply_box)
        print(reply_box.text)
        break
    except:
        pass
# resp = driver.request('POST', 'https://www.youtube.com/youtubei/v1/flag/get_form?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8&prettyPrint=false')
# print(resp.text)
button = wait.until(EC.presence_of_element_located((By.XPATH, '//*[@id="button"]')))
driver.execute_script("arguments[0].click();", button)
The problem comes with opening the menu: I believe that since you have to hover over the three-dots menu before it turns into a clickable menu, I never get to open the actual menu to report/flag the comment.
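If the menu really does only render on hover, the usual approach is ActionChains. A minimal sketch under that assumption (the relative locator for the three-dots button below is a hypothetical placeholder, not taken from the question):
from selenium.webdriver.common.action_chains import ActionChains

# Hover over the comment so YouTube renders its action buttons,
# then move to the three-dots menu and click it.
comment = driver.find_element(By.XPATH, '//*[@id="replies"]')
menu_button = comment.find_element(By.XPATH, './/yt-icon-button/button')  # hypothetical locator
ActionChains(driver).move_to_element(comment).move_to_element(menu_button).click().perform()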
My mistake was not taking the full XPath. It works perfectly like this, THANKS:
options = webdriver.ChromeOptions()
user = pathlib.Path().home()
print(user)
options.add_argument(f"user-data-dir={user}/AppData/Local/Google/Chrome/User Data/")
options.add_argument('--headless')
driver = webdriver.Chrome('chromedriver.exe', chrome_options=options)
driver.get(URL)
wait = WebDriverWait(driver, 100)
comment_box = '//*[@id="comment"]'
reply_box = '//*[@id="replies"]'
while True:
    driver.execute_script("window.scrollBy(0, 200);")
    try:
        reply_box = driver.find_element(By.XPATH, reply_box)
        print(reply_box.text)
        break
    except:
        pass
option_button = '/html/body/ytd-app/div[1]/ytd-page-manager/ytd-watch-flexy/div[5]/div[1]/div/div[2]/ytd-comments/ytd-item-section-renderer/div[3]/ytd-comment-thread-renderer[1]/div/ytd-comment-replies-renderer/div[2]/ytd-comment-renderer/div[3]/div[3]/ytd-menu-renderer/yt-icon-button/button'
option_button = wait.until(EC.presence_of_element_located((By.XPATH, option_button)))
driver.execute_script("arguments[0].click();", option_button)
report_button = '/html/body/ytd-app/ytd-popup-container/tp-yt-iron-dropdown/div/ytd-menu-popup-renderer/tp-yt-paper-listbox/ytd-menu-service-item-renderer/tp-yt-paper-item/yt-formatted-string'
report_button = wait.until(EC.presence_of_element_located((By.XPATH, report_button)))
driver.execute_script("arguments[0].click();", report_button)
report_button_spam = '/html/body/ytd-app/ytd-popup-container/tp-yt-paper-dialog/yt-report-form-modal-renderer/tp-yt-paper-dialog-scrollable/div/div/yt-options-renderer/div/tp-yt-paper-radio-group/tp-yt-paper-radio-button[1]/div[1]'
report_button_spam = wait.until(EC.presence_of_element_located((By.XPATH, report_button_spam)))
driver.execute_script("arguments[0].click();", report_button_spam)
report_button_send = '/html/body/ytd-app/ytd-popup-container/tp-yt-paper-dialog/yt-report-form-modal-renderer/div/yt-button-renderer[2]/a/tp-yt-paper-button'
report_button_send = wait.until(EC.presence_of_element_located((By.XPATH, report_button_send)))
driver.execute_script("arguments[0].click();", report_button_send)
popup_button_done = '/html/body/ytd-app/ytd-popup-container/tp-yt-paper-dialog[2]/yt-confirm-dialog-renderer/div[2]/div[2]/yt-button-renderer[3]/a/tp-yt-paper-button'
popup_button_done = wait.until(EC.presence_of_element_located((By.XPATH, popup_button_done)))
print(popup_button_done.text)
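One design note: presence_of_element_located only waits for the node to exist in the DOM, which is enough here because the clicks go through execute_script. If you ever switch to native .click(), a sketch of the more robust wait would be (report_button_xpath is a stand-in name for any of the XPath strings above):
# element_to_be_clickable waits for visibility and enabled state,
# the safer condition before a native click.
report_button = wait.until(EC.element_to_be_clickable((By.XPATH, report_button_xpath)))
report_button.click()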

How to list all links of Google search results with selenium-python?

driver.get("https://www.google.com")
search = driver.find_element_by_name("q")
search.clear()
search.send_keys("tesla")
search.send_keys(Keys.RETURN)
time.sleep(3)
url_list = []
results_list = driver.find_elements_by_tag_name('a')
for url in results_list:
    if url is None:
        pass
    else:
        url_list.append(url.get_attribute("href"))
I want to capture screenshots of all websites that show up as Google search results. However, with this code my program also grabs the links behind the "Videos, Shopping, News, Images" buttons. I just want the actual result links. How can I do this?
It's done quite differently, but it's exactly what you want :)
for i in range(len(results_list)):
    results_list[i] = results_list[i].text.replace(">", "/").replace("›", "/").replace(" ", "")
    if not validators.url(results_list[i]):
        results_list[i] = ''
results_list = list(filter(None, results_list))
print(results_list)
Full code:
from selenium import webdriver
import validators

fireFoxOptions = webdriver.FirefoxOptions()
fireFoxOptions.add_argument('--headless')
driver = webdriver.Firefox(options=fireFoxOptions, executable_path='./geckodriver.exe')
driver.get("https://www.google.com/search?q=tesla")
results_list = driver.find_elements_by_tag_name('cite')
for i in range(len(results_list)):
    results_list[i] = results_list[i].text.replace(">", "/").replace("›", "/").replace(" ", "")
    if not validators.url(results_list[i]):
        results_list[i] = ''
results_list = list(filter(None, results_list))
print(results_list)
driver.quit()
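Since the original goal was screenshots of every result, here is a minimal follow-up sketch, assuming the validated URLs in results_list load normally. It has to run before the driver.quit() call above, while the session is still open:
import re

for i, url in enumerate(results_list):
    driver.get(url)
    # Build a filesystem-safe file name out of the URL.
    safe_name = re.sub(r'[^A-Za-z0-9]+', '_', url)
    driver.save_screenshot(f"screenshot_{i}_{safe_name}.png")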

How to re-load page while looping over elements?

This is my code; it should be easy to recreate:
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options

def main():
    # Setup chrome options
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Ensure GUI is off
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--window-size=1920x3500")
    # Set path to chromedriver as per your configuration
    webdriver_service = Service("/home/sumant/chromedriver/stable/chromedriver")
    # Choose Chrome Browser
    driver = webdriver.Chrome(service=webdriver_service, options=chrome_options)
    driver.maximize_window()
    # Get page
    url = "https://www.ibrance.com/"
    driver.get(url)
    time.sleep(2)
    ele = driver.find_elements_by_tag_name('a')
    for i, e in enumerate(ele):
        try:
            print(e.get_attribute('outerHTML'))
            e.click()
            time.sleep(2)
            driver.save_screenshot(f"/mnt/d/Work/ss{i}.png")
            driver.get(url)
            # driver.refresh()
        except:
            print("element not interactable")
    driver.close()
    driver.quit()

if __name__ == '__main__':
    main()
The idea is: I click on a link, take a screenshot, load the home page again, click on the next link, and so on.
After the first link, it is not able to find any other element on the reloaded page.
That is expected: after the page is reloaded, the element references found before the reload are no longer valid.
The elements therefore need to be looked up again after each reload.
Do this:
ele = driver.find_elements_by_tag_name('a')
for i, e in enumerate(ele):
    try:
        print(e.get_attribute('outerHTML'))
        e.click()
        time.sleep(2)
        driver.save_screenshot(f"/mnt/d/Work/ss{i}.png")
        driver.get(url)
        driver.refresh()
        # reload elements
        ele = driver.find_elements_by_tag_name('a')
    except:
        print("element not interactable")
So this worked (thanks YuMa for the inspiration):
def main():
    # ...
    # Get page
    url = "https://www.ibrance.com/"
    driver.get(url)
    time.sleep(2)
    total_element = driver.find_elements_by_tag_name('a')
    total_clicks = len(total_element)

    def get_images(ele, i):
        try:
            ele[i].click()
            time.sleep(2)
            # driver.save_screenshot(f"/mnt/d/Work/ss{i}.png")
            print(driver.title)
            driver.get(url)
            time.sleep(2)
        except:
            print("")

    for i in range(total_clicks):  # one iteration per link found on the home page
        ele = driver.find_elements_by_tag_name('a')
        get_images(ele, i)
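An alternative that avoids re-finding elements altogether (a sketch, not from the original answers): collect the href values up front and navigate to them directly, so no element reference ever goes stale. Note that this only covers plain links; anchors that react to a click via JavaScript rather than an href still need the click-based approach above.
import time
from selenium.webdriver.common.by import By

url = "https://www.ibrance.com/"
driver.get(url)
time.sleep(2)

# Harvest plain href strings; strings cannot go stale.
hrefs = [a.get_attribute('href') for a in driver.find_elements(By.TAG_NAME, 'a')]
hrefs = [h for h in hrefs if h]  # skip anchors without an href

for i, href in enumerate(hrefs):
    driver.get(href)  # navigate directly instead of clicking
    time.sleep(2)
    driver.save_screenshot(f"/mnt/d/Work/ss{i}.png")  # path from the question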

Getting the YouTube channel name into a comment

I want to get this Selenium script to comment on videos that match a specific keyword.
But I want to make it say the channel name too; can someone please help me with that if possible? Thanks.
(I know the {}s should not be in there, but they give you an idea of where to put the channel name.)
(The channel name would go between the {}s, btw.)
Here is the code:
import time
import os
import numpy as np
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException

def youtube_login(email, password):
    op = webdriver.ChromeOptions()
    op.binary_location = os.environ.get("GOOGLE_CHROME_BIN")
    #op.add_argument('--headless')
    op.add_argument('--disable-dev-shm-usage')
    op.add_argument('--no-sandbox')
    driver = webdriver.Chrome()
    driver.get('https://accounts.google.com/ServiceLogin?hl=en&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fhl%3Den%26feature%3Dsign_in_button%26app%3Ddesktop%26action_handle_signin%3Dtrue%26next%3D%252F&uilel=3&passive=true&service=youtube#identifier')
    driver.find_element_by_id('identifierId').send_keys(email)
    driver.find_element_by_id('identifierNext').click()
    time.sleep(3)
    #WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, 'div#password input[name="password"]')))
    driver.find_element_by_css_selector('div#password input[name="password"]').send_keys(password)
    time.sleep(4)
    driver.find_element_by_id('passwordNext').click()
    return driver

def comment_page(driver, urls, comment):
    if len(urls) == 0:
        print('Youtube Comment Bot: Finished!')
        return []
    url = urls.pop()
    driver.get(url)
    print(url)
    driver.implicitly_wait(1)
    if not check_exists_by_xpath(driver, '//*[@id="movie_player"]'):
        return comment_page(driver, urls, random_comment())
    time.sleep(4)
    driver.execute_script("window.scrollTo(0, 600);")
    if not check_exists_by_xpath(driver, '//*[@id="simple-box"]/ytd-comment-simplebox-renderer'):
        return comment_page(driver, urls, random_comment())
    if check_exists_by_xpath(driver, '//*[@id="contents"]/ytd-message-renderer'):
        return comment_page(driver, urls, random_comment())
    WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.CSS_SELECTOR, "ytd-comments ytd-comment-simplebox-renderer")))
    driver.find_element_by_css_selector("ytd-comments ytd-comment-simplebox-renderer div#placeholder-area").click()
    driver.implicitly_wait(5)
    driver.find_element_by_xpath('//*[@id="contenteditable-root"]').send_keys(comment)
    driver.find_element_by_xpath('//*[@id="contenteditable-root"]').send_keys(Keys.CONTROL, Keys.ENTER)
    post = WebDriverWait(driver, 15).until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, 'ytd-comments ytd-comment-simplebox-renderer'))
    )
    post.click()
    r = np.random.randint(2, 5)
    time.sleep(r)
    return comment_page(driver, urls, random_comment())

def random_comment():
    # You can edit these lines=======
    messages = [
        'sup {channel name here}, I loved this video lol cant wait to see more :D'
    ]
    # ===============================
    r = np.random.randint(0, len(messages))
    return messages[r]

def check_exists_by_xpath(driver, xpath):
    try:
        driver.find_element_by_xpath(xpath)
    except NoSuchElementException:
        return False
    return True

if __name__ == '__main__':
    # You should edit these lines=======
    email = 'Youremail@gmail.com'
    password = 'Yourpassword'
    # ==================================
    urls = [
        'https://www.youtube.com/watch?v=Szww2_VqEKs&t',
    ]
    inp = open("url.txt", "r")
    for line in inp.readlines():
        urls.append(line)
    driver = youtube_login(email, password)
    comment_page(driver, urls, random_comment())
Here is the locator to get the channel name:
//div[contains(@class, 'channel-name')]//a
All you have to do now is read its text and substitute it into your comment string.
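A minimal sketch of that substitution, assuming the locator above matches on the video page. Passing the channel name into random_comment() and using a {channel} placeholder is an adaptation of the original template, not the original code:
import numpy as np
from selenium.webdriver.common.by import By

def random_comment(channel_name):
    # Message templates with a placeholder for the channel name.
    messages = [
        'sup {channel}, I loved this video lol cant wait to see more :D'
    ]
    r = np.random.randint(0, len(messages))
    return messages[r].format(channel=channel_name)

# Inside comment_page(), after the video page has loaded:
channel_name = driver.find_element(By.XPATH, "//div[contains(@class, 'channel-name')]//a").text
comment = random_comment(channel_name)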
