Lists of different lengths when building a DataFrame from scraped data - Python

When scraping the reactions to a post on a Facebook page, I can scrape all the information (comments, reactions, tags, ...), but when I want to put it in a DataFrame I get an error ("arrays must all be same length"). This is expected, because sometimes one person leaves only a comment and another only a tag, so I end up with lists of different lengths. I think I could add a conditional if, but maybe there is a more optimized solution...
For example len(tag) = 2, len(usr) = 17, len(commentaire) = 12.
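For reference, the length mismatch is easy to reproduce outside the scraper, and one crude workaround is to wrap each list in a pandas Series so pandas pads the shorter columns with NaN - a minimal sketch with made-up values, separate from the scraping code below:
import pandas as pd

# Made-up lists with the lengths above (2, 17, 12)
tag = ['t1', 't2']
usr = [f'user{i}' for i in range(17)]
commentaire = [f'comment{i}' for i in range(12)]

# pd.DataFrame({'user': usr, ...}) raises "arrays must all be same length",
# but wrapping each list in a Series lets pandas pad the short ones with NaN:
df = pd.DataFrame({'user': pd.Series(usr),
                   'commentaire': pd.Series(commentaire),
                   'tag': pd.Series(tag)})
print(df)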
thanks :)
#imports here
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
import requests
from bs4 import BeautifulSoup
import time
from time import sleep
from lxml import html
import logging as log
import pandas as pd
#path to chromedriver and disable Facebook's automatic notification pop-ups (anti-scraping)
chrome_options = webdriver.ChromeOptions()
prefs = {"profile.default_content_setting_values.notifications" : 2}
chrome_options.add_experimental_option("prefs",prefs)
driver = webdriver.Chrome('C:/Users/User/Downloads/chromedriver.exe',
chrome_options=chrome_options)
#open FB
driver.get("http://www.facebook.com")
print ("facebook page log ok")
sleep(1)
#locate user and pass fields (css_selector)
username = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR,
"input[name='email']")))
password = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR,
"input[name='pass']")))
##locate user and pass fields and click (xpath)
#username = driver.find_element(By.XPATH,"//input[contains(@id,'email')]")
#password = driver.find_element(By.XPATH,"//input[contains(@id,'pass')]")
usr=input('Enter Email Id:')
pwd=input('Enter Password:')
#enter credentials
username.clear()
username.send_keys(usr)
print ("Email Id entered")
sleep(1)
password.clear()
password.send_keys(pwd)
print ("Pass entered")
#locate the log in button and click
button = WebDriverWait(driver, 2).until(EC.element_to_be_clickable((By.CSS_SELECTOR,
"button[type='submit']"))).click()
print("login Successfully...")
time.sleep(5)
post = 'https://mbasic.facebook.com/AJSTunisie/posts/6452145678144265'
#open the webpage
driver.get(post)
page = requests.get(post)
df_comm = pd.DataFrame(columns = ['post_url', 'user', 'commentaire', 'tag', 'user_url'])
page_count = 0
while True:
    #scrape the reactions
    tree = html.fromstring(driver.page_source)
    user = tree.xpath("//div[@class='eb']/div/h3/a/text()")
    commentaire = tree.xpath("//div[@class='eb']/div/div[1]/text()")
    tag = tree.xpath("//div[@class='eb']/div/div[1]/a/text()")
    user_url = tree.xpath("//div[@class='eb']/div/h3/a/@href")
    data = {'post_url': [post]*len(user), 'user': user, 'commentaire': commentaire, 'tag': tag,
            'user_url': user_url}
    df_comm = df_comm.append(pd.DataFrame(columns=df_comm.columns, data=data))
    #Check if more reactions exist (whether "En afficher davantage" is present or not)
    next_link = tree.xpath("//div[@class='eb eu']/a/@href")
    if len(next_link) != 0:
        driver.find_element_by_xpath("//div[@class='eb eu']/a/@href").click()
        page_count = page_count + 1
    else:
        next_link = ''
        break
df_comm = df_comm.reset_index()
#df_comm.to_csv(path, index=False)
driver.close()

You should do it a different way.
First, find all the comments - the elements that contain the text, user, tag, etc. - and then use a for-loop to work with every comment separately. Inside the loop, use a relative XPath (starting with .) to get only the information for that single comment. Then, if the tag or another item is missing, you can substitute a default value - e.g. an empty string.
This way every comment has all the values, so every row in the CSV has the same size.
It also solves another problem: with the previous method you could get the first comment paired with the tag from the second comment, and you couldn't control it.
To keep the code simpler I append every comment to a list of rows and later convert it all to a DataFrame.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from bs4 import BeautifulSoup
import time
from lxml import html
import logging as log
import pandas as pd
#path to chromedriver and disable Facebook's automatic notification pop-ups (anti-scraping)
chrome_options = webdriver.ChromeOptions()
prefs = {"profile.default_content_setting_values.notifications" : 2}
chrome_options.add_experimental_option("prefs",prefs)
#driver = webdriver.Chrome('C:/Users/User/Downloads/chromedriver.exe', chrome_options=chrome_options)
driver = webdriver.Chrome(chrome_options=chrome_options)
#open FB
driver.get("http://www.facebook.com")
print ("facebook page log ok")
time.sleep(1)
#locate user and pass fields (css_selector)
username = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR,
"input[name='email']")))
password = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR,
"input[name='pass']")))
##locate user and pass fields and click (xpath)
#username = driver.find_element(By.XPATH,"//input[contains(@id,'email')]")
#password = driver.find_element(By.XPATH,"//input[contains(@id,'pass')]")
usr = input('Enter Email Id:')
pwd = input('Enter Password:')
#enter credentials
username.clear()
username.send_keys(usr)
print ("Email Id entered")
#time.sleep(1)
password.clear()
password.send_keys(pwd)
print ("Pass entered")
#locate the log in button and click
button = WebDriverWait(driver, 2).until(EC.element_to_be_clickable((By.CSS_SELECTOR,
"button[type='submit']"))).click()
print("login Successfully...")
time.sleep(5)
post_url = 'https://mbasic.facebook.com/AJSTunisie/posts/6452145678144265'
#open the webpage
driver.get(post_url)
all_rows = []
page_count = 0
while True:
    #scrape the reactions
    page_count += 1
    print('\n--- page:', page_count, '---\n')
    tree = html.fromstring(driver.page_source)
    # find all comments
    all_comments = tree.xpath("//div[@class='ec']/div")
    print('len(all_comments):', len(all_comments))
    # work with every comment separately
    for comment in all_comments:
        user = comment.xpath(".//h3/a/text()")  # relative xpath starting at `.`
        print('user:', user)
        user = user[0] if user else ""  # set default value
        print('user:', user)
        commentaire = comment.xpath(".//div[1]/text()")  # relative xpath starting at `.`
        print('commentaire:', commentaire)
        commentaire = commentaire[0] if commentaire else ""  # set default value
        print('commentaire:', commentaire)
        tag = comment.xpath(".//div[1]/a/text()")  # relative xpath starting at `.`
        print('tag:', tag)
        tag = tag[0] if tag else ""  # set default value
        print('tag:', tag)
        user_url = comment.xpath(".//h3/a/@href")  # relative xpath starting at `.`
        print('user_url:', user_url)
        user_url = user_url[0] if user_url else ""  # set default value
        print('user_url:', user_url)
        all_rows.append([post_url, user, commentaire, tag, user_url])
    #Check if more reactions exist (whether "En afficher davantage" is present or not)
    next_link = driver.find_elements_by_xpath("//div[@class='ec es']/a")
    print('---')
    print('len(next_link):', len(next_link))
    if next_link:
        next_link[0].click()
        time.sleep(2)
    else:
        break
# - after loop -
df = pd.DataFrame(all_rows, columns=['post_url', 'user', 'commentaire', 'tag', 'user_url'])
print(df)
df.to_csv('output.csv', index=False)
#driver.close()

Related

Can't locate element in web page for scraping

I'm currently writing a script to crawl LinkedIn with a bot, but I'm running into a few problems. First, here's my code:
import selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from bs4 import BeautifulSoup, NavigableString, Tag
from time import sleep
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
import csv
import time
import requests
print('- Importation des packages')
# Task 1: Login to Linkedin
driver = webdriver.Chrome(ChromeDriverManager().install())
# Task 1.1: Open Chrome and Access Linkedin login site
sleep(2)
url = 'https://www.linkedin.com/login'
driver.get(url)
print('Initialisation du chrome driver')
sleep(2)
# Task 1.2: Import username and password
credential = open(r"C:\credentials.txt")
line = credential.readlines()
username = line[0]
password = line[1]
print('Importation des id')
sleep(2)
# Task 1.2: Key in login credentials
email_field = driver.find_element(By.ID, 'username')
email_field.send_keys(username)
print('Email ok')
sleep(3)
password_field = driver.find_element(By.NAME, 'session_password')
password_field.send_keys(password)
print('Mdp ok')
sleep(2)
# Task 1.2: Click the Login button
signin_field = driver.find_element(By.XPATH, '//*[@id="organic-div"]/form/div[3]/button')
signin_field.click()
sleep(3)
print('- Task A: Connexion à Linkedin')
search_field = driver.find_element(By.XPATH, '//*[@id="global-nav-typeahead"]/input')
search_query = input('Type of profile to scrape ')
search_field.send_keys(search_query)
search_field.send_keys(Keys.RETURN)
print('TASK B OK')
def GetURL():
    page_source = BeautifulSoup(driver.page_source, features='lxml')
    profiles = driver.find_elements(By.XPATH, '//*[@id="main"]/div/div/div/ul/li/div/div/div[2]/div[1]/div[1]/div/span/span/a/span/span[1]')
    all_profile_URL = []
    for profile in profiles:
        #profile_ID = profiles.get_attribute('href')
        #profile_URL = "https://www.linkedin.com" + profile_ID
        profile_URL = profile.get_attribute('href')
        print(profile_URL)
        if profile not in all_profile_URL:
            all_profile_URL.append(profile_URL)
    return all_profile_URL
##Pagination
input_page = int(input('Nombre de pages à scraper: '))
URLs_all_page = []
for page in range(input_page):
    URLs_one_page = GetURL()
    sleep(2)
    driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')  # scroll to the end of the page
    sleep(3)
    next_button = driver.find_element(By.CLASS_NAME, "artdeco-button__text")
    driver.execute_script("arguments[0].click();", next_button)
    if URLs_one_page is not None:
        URLs_all_page = URLs_all_page + URLs_one_page
        print(URLs_all_page)
    else:
        print('variable stores a None value')
    sleep(2)
print(URLs_all_page)
#Scrape and store in a csv
with open('nouvow.csv', 'w', newline='') as file_output:
    headers = ['Nom_prénom', 'Job', 'Location', 'URL']
    writer = csv.DictWriter(file_output, delimiter=',', lineterminator='\n', fieldnames=headers)
    writer.writeheader()
    for linkedin_URL in URLs_all_page:
        driver.get_all(linkedin_URL)
        print('- Accès au profile: ', linkedin_URL)
        page_source = BeautifulSoup(driver.page_source, "html.parser")
        info_div = page_source.find('div', {'class': 'flex-1 mr5'})
        info_loc = info_div.find_all('ul')
        name = info_loc[0].find('li').get_text().strip()  # strip the useless characters
        print('--- Nom: ', name)
        location = info_loc[1].find('li').get_text().strip()
        print('--- Localisation :', location)
        title = info_div.find('h2').get_text().strip()
        print('--- Job: ', title)
        writer.writerow({headers[0]:name, headers[1]:location, headers[2]:title, headers[3]:linkedin_URL})
        print('\n')
print('Ok final')
Here's my output:
Initialisation du chrome driver
Importation des id
Email ok
Mdp ok
- Task A: Connexion à Linkedin
Quels type de profil souhaitez vous scraper?? fullstack blockchain
TASK B OK
Nombre de pages à scraper: 4
[]
None
None
None
None
None
None
None
None
None
None
[None, None, None, None, None, None, None, None, None, None]
So I think I couldn't locate the profiles and get their links because I have a problem with my XPath expression:
//*[@id="main"]/div/div/div/ul/li/div/div/div[2]/div[1]/div[1]/div/span/span/a/span/span[1]
Also, it doesn't click on the next button for pagination, and finally I get an error at this line of code:
driver.get_all(linkedin_URL)
I would really appreciate some help with that, thank you.
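A hedged sketch of the two most likely fixes, based only on the code above: driver.get_all is not a Selenium WebDriver method (plain driver.get is), and the profile links are easier to read from the href of the <a> elements themselves than from the innermost span. The XPath in this variant of GetURL is an assumption - LinkedIn's markup changes often:
from selenium.webdriver.common.by import By

def GetURL(driver):
    # Target the <a> tags that carry the href, not the nested <span> text nodes
    # (XPath is an assumption; adjust it to the live markup)
    anchors = driver.find_elements(By.XPATH, '//*[@id="main"]//ul/li//span/span/a')
    all_profile_URL = []
    for a in anchors:
        profile_URL = a.get_attribute('href')
        if profile_URL and profile_URL not in all_profile_URL:
            all_profile_URL.append(profile_URL)
    return all_profile_URL

# And when visiting each profile, use driver.get() - get_all() does not exist:
# for linkedin_URL in URLs_all_page:
#     driver.get(linkedin_URL)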

How to paginate and scrape data from an aspx page without selenium

I am trying to paginate and scrape each table's details from this site:
https://www.cyprusbar.org/CypriotAdvocateMembersPage.aspx
Screenshot
I need to click each details box, get redirected to a new window, and do the same for the other records on each page. Then paginate. Here is my Selenium code:
import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options
PATH = 'chromedriver.exe'
options = Options()
options.add_experimental_option('excludeSwitches', ['enable-logging'])
options.add_argument("--lang=en")
driver = webdriver.Chrome(executable_path=PATH, options=options)
driver.maximize_window()
driver.get('https://www.cyprusbar.org/CypriotAdvocateMembersPage.aspx')
driver.find_element_by_xpath('//*[@id="Div1"]/input').click()
def wait(locator, id):
    element = WebDriverWait(driver, 50).until(
        EC.presence_of_all_elements_located((locator, id))
    )
    return element
DATA = []
name = '//*[@id="ctl00_ContentPlaceHolder1_TxtName_I"]'
postal = '//*[@id="ctl00_ContentPlaceHolder1_TxtPostalCode_I"]'
fax = '//*[@id="ctl00_ContentPlaceHolder1_TxtFax_I"]'
province = '//*[@id="ctl00_ContentPlaceHolder1_TxtDistrict_I"]'
email = '//*[@id="ctl00_ContentPlaceHolder1_TxtEmail_I"]'
address = '//*[@id="ctl00_ContentPlaceHolder1_TxtAddress_I"]'
phone = '//*[@id="ctl00_ContentPlaceHolder1_TxtPhone_I"]'
courtroom = '//*[@id="ctl00_ContentPlaceHolder1_TxtCourtBox_I"]'
webpage = '//*[@id="ctl00_ContentPlaceHolder1_TxtUrl_I"]'
details = ['Postal Code', 'Fax', 'Calendar Province', 'Email', 'Address', 'Phone', 'Courtroom', 'Webpage']
def gotopage(page):
    for p in range(page-1):
        next_page = driver.find_element_by_class_name('dxWeb_pNext_Material')
        action = ActionChains(driver)
        action.click(next_page)
        action.perform()
        time.sleep(4)
def each_page(page, new):
    global DATA
    curr = 0
    while curr < 80:
        if page > 1 and new:
            gotopage(page)
        action = ActionChains(driver)
        action.move_to_element(driver.find_element_by_xpath('//*[@id="ctl00_ContentPlaceHolder1_LawyersGrid_DXPagerBottom_PSI"]')).click()
        action.perform()
        action.send_keys(Keys.ARROW_UP, Keys.RETURN)
        action.perform()
        time.sleep(17)
        data = {}
        action = ActionChains(driver)
        detail_list = wait(By.CLASS_NAME, 'dxb-hbc')
        try:
            action.click(detail_list[curr])
            action.perform()
        except IndexError:
            print(curr)
            driver.back()
            gotopage(page)
        data['Name'] = wait(By.XPATH, name)[0].get_attribute('value')
        for i, d in enumerate([postal, fax, province, email, address, phone, courtroom, webpage]):
            info = driver.find_element_by_xpath(d).get_attribute('value')
            data[details[i]] = info
        DATA.append(data)
        curr += 1
        driver.back()
print('============SCRAPING===============')
page = 1
new=True
while page <= 50:
    try:
        each_page(page, new)
        page += 1
    except Exception as err:
        print(err)
        print(page)
The problem here is that this is incredibly slow, because each time you call
driver.back()
it goes back to page 1, and I would then need to navigate back to the page it was on.
Is there any way I can achieve this with something like BeautifulSoup?
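One common non-Selenium route - sketched below, not verified against this site - is to replay the page's own POST requests with requests: ASP.NET pages carry their state in hidden form fields (__VIEWSTATE, __EVENTVALIDATION, ...), so you collect those from the current page and send them back together with the pager's postback target. The __EVENTTARGET/__EVENTARGUMENT values below are placeholders that would have to be read from the real __doPostBack(...) calls in the page source, and a DevExpress grid like this one may use its own callback protocol instead, so treat this only as the general pattern:
import requests
from bs4 import BeautifulSoup

URL = 'https://www.cyprusbar.org/CypriotAdvocateMembersPage.aspx'

session = requests.Session()
soup = BeautifulSoup(session.get(URL).content, 'html.parser')

def hidden_fields(soup):
    # ASP.NET keeps page state in hidden inputs that must be echoed back on every POST
    return {inp.get('name'): inp.get('value', '')
            for inp in soup.select('input[type=hidden]') if inp.get('name')}

payload = hidden_fields(soup)
# Placeholder values - read the real target/argument from the pager's __doPostBack(...) calls
payload['__EVENTTARGET'] = 'ctl00$ContentPlaceHolder1$LawyersGrid'
payload['__EVENTARGUMENT'] = 'Page$2'

response = session.post(URL, data=payload)
next_page = BeautifulSoup(response.content, 'html.parser')
print(next_page.title)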

Instagram Comment Scraping, Scrapes Username instead of comment

So I am trying to scrape usernames and comments from multiple posts, using the code below.
from selenium.webdriver.common.by import By
from selenium import webdriver
import time
import sys
import pandas as pd
from pandas import ExcelWriter
import os.path
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
url=['https://www.instagram.com/p/CRLe53_hmMH','https://www.instagram.com/p/CRX7VL1sL54/?utm_medium=share_sheet',
'https://www.instagram.com/p/CRVB7ykM7-R/?utm_medium=share_sheet', 'https://www.instagram.com/p/CRQ9Bq5M6ce/?utm_medium=share_sheet',
'https://www.instagram.com/p/CRQT1BJMmSi/?utm_medium=share_sheet', 'https://www.instagram.com/p/CM8T3HgMQG0/?utm_medium=copy_link',
'https://www.instagram.com/p/COrn5fYs78O/?utm_medium=share_sheet']
user_names = []
user_comments = []
driver = webdriver.Chrome('E:/chromedriver')
driver.get(url[0])
time.sleep(3)
username = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "input[name='username']")))
password = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "input[name='password']")))
username.clear()
username.send_keys('myuname')
password.clear()
password.send_keys('mypassword')
Login_button = WebDriverWait(driver, 2).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "button[type='submit']"))).click()
time.sleep(4)
not_now = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, '//button[contains(text(), "Not Now")]'))).click()
not_now2 = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, '//button[contains(text(), "Not Now")]'))).click()
for n in url:
    try:
        driver.get(n)
        time.sleep(3)
        load_more_comment = driver.find_element_by_class_name('glyphsSpriteCircle_add__outline__24__grey_9')
        print("Found {}".format(str(load_more_comment)))
        i = 0
        while load_more_comment.is_displayed() and i < 10:
            load_more_comment.click()
            time.sleep(1.5)
            load_more_comment = driver.find_element_by_class_name('glyphsSpriteCircle_add__outline__24__grey_9')
            print("Found {}".format(str(load_more_comment)))
            i += 1
        user_names.pop(0)
        user_comments.pop(0)
    except Exception as e:
        print(e)
        pass
    comment = driver.find_elements_by_class_name('gElp9 ')
    for c in comment:
        container = c.find_element_by_class_name('C4VMK')
        name = container.find_element_by_class_name('_6lAjh ').text
        content = container.find_element_by_tag_name('span').text
        content = content.replace('\n', ' ').strip().rstrip()
        user_names.append(name)
        user_comments.append(content)
        print(content)
    user_names.pop(0)
    user_comments.pop(0)
    #export(user_names, user_comments)
driver.close()
df = pd.DataFrame(list(zip(user_names, user_comments)),
                  columns=['Name', 'Comments'])
#df.to_excel('ujicoba_gabung_IG_6.xlsx')
print(df)
But somehow, instead of returning the username and the comment, both user_names and user_comments end up containing usernames. Where did I make a mistake?
Here are my outputs.
I think my problem is in the for loop where I select the container as C4VMK, but I inspected the element on Instagram and it is already the same.
There are two spans in the C4VMK class. The first is at h3 -> first div -> span, and the second one is the one you want.
To get the second span, which is the comment, replace your code with the line below so you take the second element:
content = container.find_elements_by_tag_name('span')[1].text
Your container is correct. However, when you search for a span by tag name like this:
content = container.find_element_by_tag_name('span').text
Selenium will find the first span under the container, which in this case is the username span with the class 'Jv7Aj mArmR MqpiF '.
What you are looking for is the other span that I highlighted in the image, which is a direct child of the container with an empty class.
You can select it like this:
content = container.find_element_by_xpath("./span[@class='']")
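Putting the two suggestions together, the comment loop from the question might look like the sketch below; driver is the logged-in WebDriver from the question, and the class names are copied from the question - they are assumptions that Instagram can change at any time:
# Sketch only: class names copied from the question and likely to change
user_names, user_comments = [], []
comments = driver.find_elements_by_class_name('gElp9')
for c in comments:
    container = c.find_element_by_class_name('C4VMK')
    name = container.find_element_by_class_name('_6lAjh').text
    # take the comment span (empty class), not the first span (the username)
    content = container.find_element_by_xpath("./span[@class='']").text
    user_names.append(name)
    user_comments.append(content.replace('\n', ' ').strip())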

Can't get HTML source of a Javascript Generated page with selenium

So I have this page (https://www.ssn.gob.ar/storage/registros/productores/productoresactivosfiltro.asp) from which I want to extract data.
You can get a person's data just by entering numbers in the "Matricula" field - that part is easy - but when the new page is generated and I want to get the data from a specific div, it gives you None, and when checking the HTML it uses to browse the data, it is the same as the page where I enter the numbers to access the data.
import os
import time
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
def clear(): return os.system("cls")
options = webdriver.ChromeOptions()
options.add_argument('--start-maximized')
options.add_argument('--disable-extensions')
driver_path = 'C:\\Users\\Menem Lo Hizo\\Downloads\\chromedriver_win32\\chromedriver.exe'
driver = webdriver.Chrome(driver_path, chrome_options=options)
driver.get('https://www.ssn.gob.ar/storage/registros/productores/productoresactivosfiltro.asp')
matricula = driver.find_element_by_id("matricula")
matricula.send_keys("2")
matricula.send_keys(Keys.RETURN)
try:
    div = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CLASS_NAME, "col-md-8 col-md-offset-2"))
    )
except:
    driver.quit()
clear()
print(div)
This is my code.
Logging one's network traffic when submitting the form reveals an HTTP POST request being made to productoresactivos.asp, the response of which is HTML. Simply imitate that request:
def get_columns():
    import requests
    from bs4 import BeautifulSoup as Soup
    url = "https://www.ssn.gob.ar/storage/registros/productores/productoresactivos.asp"
    payload = {
        "socpro": "PAS",
        "matricula": "2",
        "apellidorazonsocial": "",
        "docNro": "",
        "Submit": "Buscar"
    }
    response = requests.post(url, data=payload)
    response.raise_for_status()
    soup = Soup(response.content, "html.parser")
    for column in soup.select("div[class^=\"col-md-\"]"):
        yield " ".join(column.get_text().strip().split())

def main():
    for text in get_columns():
        print(text)
    return 0

if __name__ == "__main__":
    import sys
    sys.exit(main())
Output:
Página 1 de 1
Matrícula: 2
Nombre: CABELLO DE GADANO, MARIA CRISTINA
Documento: DNI - 5263977
CUIT: 27-05263977-3
Ramo: PATRIMONIALES Y VIDA
Domicilio: AV. CORDOBA 669 12º B
Localidad: CIUDAD AUTONOMA BS.AS.
Provincia CIUDAD AUTONOMA
Cod. Postal: 1054
Teléfonos: 4311-5860
E-mail:
Nro. de Resolución 17053
Fº de Resolución 06/01/1983
Nro. de Libro: 01
Nro. de Rubrica: 20395
Fº. de Rubrica: 21/08/1992
Nro. de Libro: 1
Fº. de Rubrica: 20396
Fº. de Rubrica: 21/08/1992
>>>
A few things:
You need explicit waits.
When you hit Enter on the first page, a new tab opens up; you need to switch to that window.
Code:
driver.get("https://www.ssn.gob.ar/storage/registros/productores/productoresactivosfiltro.asp")
wait = WebDriverWait(driver, 10)
org_handles = driver.window_handles
wait.until(EC.element_to_be_clickable((By.ID, "matricula"))).send_keys("2" + Keys.RETURN)
new_handles = driver.window_handles
driver.switch_to.window(new_handles[1])
div = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, ".col-md-8.col-md-offset-2")))
print(div.text)

How to print the open pdf link from using Selenium in Python?

I am not able to print the link of the final PDF which opens after running the given code:
from selenium import webdriver
from selenium.webdriver.support import ui
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException
def page_is_loaded(driver):
    return driver.find_element_by_tag_name("body") != None

def check_exists_by_text(text):
    try:
        driver.find_element_by_link_text(text)
    except NoSuchElementException:
        return False
    return True
driver = webdriver.Chrome("C:/Users/Roshan/Desktop/sbi/chromedriver")
driver.maximize_window()
driver.get("http://www.careratings.com/brief-rationale.aspx")
wait = ui.WebDriverWait(driver,10)
wait.until(page_is_loaded)
location_field = driver.find_element_by_name("txtfromdate")
location_field.send_keys("2019-05-06")
last_date = driver.find_element_by_name("txttodate")
last_date.send_keys("2019-05-21")
driver.find_element_by_xpath("//input[#name='btn_submit']").click()
if check_exists_by_text('Reliance Capital Limited'):
    elm = driver.find_element_by_link_text('Reliance Capital Limited')
    driver.implicitly_wait(5)
    elm.click()
    driver.implicitly_wait(50)
    #time.sleep(5)
    #driver.quit()
else:
    print("Company is not rated in the given Date range")
I am expecting the actual output to be the link to this PDF:
"http://www.careratings.com/upload/CompanyFiles/PR/Reliance%20Capital%20Ltd.-05-18-2019.pdf"
but I do not know how to print this link.
You need to find all the elements in the table, then extract the data from them.
from selenium import webdriver
import os
# setup path to chrome driver
chrome_driver = os.getcwd() + '/chromedriver'
# initialise chrome driver
browser = webdriver.Chrome(chrome_driver)
# load url
browser.get('http://www.careratings.com/brief-rationale.aspx')
# setup date range
location_field = browser.find_element_by_name("txtfromdate")
location_field.send_keys("2019-05-06")
last_date = browser.find_element_by_name("txttodate")
last_date.send_keys("2019-05-21")
browser.find_element_by_xpath("//input[#name='btn_submit']").click()
# get all data rows
content = browser.find_elements_by_xpath('//*[@id="divManagementSpeak"]/table/tbody/tr/td/a')
# get text and href link from each element
collected_data = []
for item in content:
    url = item.get_attribute("href")
    description = item.get_attribute("innerText")
    collected_data.append((url, description))
Output:
('http://www.careratings.com/upload/CompanyFiles/PR/Ashwini%20Frozen%20Foods-05-21-2019.pdf', 'Ashwini Frozen Foods')
('http://www.careratings.com/upload/CompanyFiles/PR/Vanita%20Cold%20Storage-05-21-2019.pdf', 'Vanita Cold Storage')
and so on
I would say you just need to put this line:
pdf_link = elm.get_attribute("href")
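As a sketch of where that line would slot into the question's code (assuming, as this answer and the table output above suggest, that the link's href already points at the PDF):
# Sketch: read the href from the link element before clicking it
if check_exists_by_text('Reliance Capital Limited'):
    elm = driver.find_element_by_link_text('Reliance Capital Limited')
    pdf_link = elm.get_attribute("href")  # assumed to be the PDF URL
    print(pdf_link)
    elm.click()
else:
    print("Company is not rated in the given Date range")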
Just check the image below. You have missed one important part to click on: when you enter some text in that input box, a dropdown appears below it displaying the matching results available to choose from. Once you click on one of those, the rest works as before.
Try the following script:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
url = "http://www.careratings.com/brief-rationale.aspx"
with webdriver.Chrome() as driver:
    driver.get(url)
    wait = WebDriverWait(driver, 10)
    location_field = wait.until(EC.presence_of_element_located((By.NAME, "txtfromdate")))
    location_field.send_keys("2019-05-06")
    last_date = wait.until(EC.presence_of_element_located((By.NAME, "txttodate")))
    last_date.send_keys("2019-05-21")
    input_search = wait.until(EC.presence_of_element_located((By.NAME, "txtSearchCompany_brief")))
    input_search.send_keys('Reliance Capital Limited')
    time.sleep(3)  # could not get rid of this hardcoded delay to make the script work
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "[onclick*='Reliance Capital Limited']"))).click()
    # time.sleep(2)  # activate this line in case the script behaves otherwise
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "input[name='btn_submit']"))).click()
    for item in wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "table tr td > a[href$='.pdf']"))):
        print(item.get_attribute("href"))
