Extracting LinkedIn job descriptions with Python

I am trying to extract the job title, company, location and description from LinkedIn job pages (example of a job page).
I managed to get the job title, company and location, since each of those spans has its own class name.
I am struggling with the job description because its span has no class, and part of the description sits inside a list. I tried extracting the text with an absolute XPath, but that doesn't work either.
from io import StringIO
from parsel import Selector
from time import sleep
from selenium.webdriver.edge.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium import webdriver
import parameter  # local module holding the login credentials and the search URL

driver = webdriver.Edge('C:/Users/users/Downloads/edgedriver_win64/msedgedriver.exe')

# accessing LinkedIn
driver.get('https://www.linkedin.com')

# login
username = driver.find_element_by_name('session_key')
username.send_keys(parameter.email)
password = driver.find_element_by_name('session_password')
password.send_keys(parameter.password)
submit = driver.find_element_by_class_name('sign-in-form__submit-button')
submit.click()
sleep(2)

# open the search results and collect the job card links
driver.get(parameter.siteQuery)
sleep(5)
wait = WebDriverWait(driver, 20)
links = driver.find_elements_by_xpath("//a[@class='disabled ember-view job-card-container__link']")
links = [link.get_attribute("href") for link in links]
sleep(1)

for link in links:
    driver.get(link)
    sleep(5)
    # moreinfo = driver.find_element_by_class_name('artdeco-card__action')
    # moreinfo.click()
    sel = Selector(text=driver.page_source)
    title = sel.xpath('//h1[@class="t-24 t-bold"]/text()').extract()
    company = sel.xpath('//span[@class="jobs-unified-top-card__company-name"]/text()').extract()
    location = sel.xpath('//span[@class="jobs-unified-top-card__bullet"]/text()').extract()
    description = sel.xpath('/html/body/div[6]/div[3]/div/div[1]/div[1]/div/div[2]/article/div/div[1]/span/text()').extract()
I also tried extracting straight from the div, which didn't work either:
description = sel.xpath('//*[@id="jobs-details"]/span/text()').extract()
I am using parsel.Selector together with Selenium.
Any idea how to get the whole description?
Thanks in advance.

I figured it out after realizing I can simply extract the whole HTML element: take the entire span first and clean it up afterwards.
Extracting the span:
descriptions = sel.xpath('//*[@id="job-details"]').extract()
listDescriptions.append(descriptions[0])
Cleaning the data:
import re
from bs4 import BeautifulSoup

listDescriptions2 = []
for description in listDescriptions:
    description = BeautifulSoup(description, features='html.parser').text
    description = re.sub(r'\n', '', description)
    listDescriptions2.append(description)
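A small variation on the cleaning step, in case the bullet-point part of the description should stay readable: BeautifulSoup's get_text() accepts a separator, so list items and paragraphs are joined with a space instead of running into each other. This is only a sketch of the same cleaning idea, not part of the original answer:
from bs4 import BeautifulSoup

def clean_description(raw_html):
    # parse the extracted span and keep one space between text nodes,
    # so <li> items and paragraphs stay separated
    soup = BeautifulSoup(raw_html, 'html.parser')
    text = soup.get_text(separator=' ', strip=True)
    return ' '.join(text.split())  # collapse repeated whitespace

listDescriptions2 = [clean_description(d) for d in listDescriptions]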

Related

Extracting the titles of the websites mentioned in the link

https://www.g2.com/categories/marketing-automation
I am trying to scrape the link above, which lists 350+ products, and I need to extract the title of each website mentioned.
But I am failing to get any results. I have tried requests with Beautiful Soup, and then Selenium, and all I get is an empty list "[]" or None.
import requests
from bs4 import BeautifulSoup
# Send a GET request to the URL and parse the HTML content
url = 'https://www.g2.com/categories/marketing-automation'
response = requests.get(url).text
soup = BeautifulSoup(response, 'html.parser')
name = soup.find(class_ = "product-card__product-name")
print(name)
The code above is just a test to check whether the data is being pulled at all, and the result is None.
From this code I expected print to show the element with the class mentioned.
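One quick way to narrow this down is to check whether the class name even appears in the raw HTML that requests receives; if it does not, the listing is rendered client-side or the request is served a bot-protection page, and only a real browser will see it. A minimal check, assuming nothing beyond the URL and class name already used above:
import requests

url = 'https://www.g2.com/categories/marketing-automation'
resp = requests.get(url)
print(resp.status_code)                           # e.g. 200 vs. 403
print('product-card__product-name' in resp.text)  # False means the class is not in the raw HTML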
I kind of got this code to return something; I'm still working on it.
from selenium import webdriver
from selenium.webdriver.common.by import By
# Create a new instance of the Chrome driver
driver = webdriver.Chrome()
# Navigate to the webpage
driver.get('https://www.g2.com/categories/marketing-automation')
# Wait for the page to load
driver.implicitly_wait(10)
# Find all the product cards on the page
product_cards = driver.find_elements(By.CLASS_NAME, 'product-card__product-name')
# Iterate over the product cards and extract the title from each one
for product_card in product_cards:
    title = product_card.text
    print(title)
# Close the browser
driver.quit()
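The implicit wait above can fire before the product cards are actually rendered. A slightly more robust variant (only a sketch, written with the Selenium 4 By/WebDriverWait API rather than the shortcut methods used elsewhere in this thread) waits explicitly until at least one card is present:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get('https://www.g2.com/categories/marketing-automation')

# block until at least one product name is rendered, up to 20 seconds
wait = WebDriverWait(driver, 20)
cards = wait.until(EC.presence_of_all_elements_located(
    (By.CLASS_NAME, 'product-card__product-name')))

for card in cards:
    print(card.text)

driver.quit()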

Scraping data with Python from a reCAPTCHA-protected website

I've been trying to scrape some info from a website for personal use. It works nicely, no errors, but I found out it somehow can't see the email addresses from the second half of the site. The code I'm using:
import requests
from bs4 import BeautifulSoup
page = requests.get('https://rejestradwokatow.pl/adwokat/abramowicz-joanna-49486')
soup = BeautifulSoup(page.content, "html.parser")
kancelaria = [x.strip() for x in soup.find(
    'div', class_='mb_tab_content special_one').find_all('div')[::2][0].text.split('\n') if x != ''][1:]
with the result:
>>> kancelaria
['Kancelaria Adwokacka', 'Chlebnicka 48/51', '80-830 Gdańsk', '', 'Stacjonarny/Fax: 583054010', 'Email: [email\xa0protected]']
Please note the last element: 'Email: [email\xa0protected]'. I believe it has something to do with the reCAPTCHA mechanism implemented on the website, but I have no idea how to get around it. Interestingly, emails from the first half of the site are visible to my program and can be scraped. Any thoughts?
EDIT:
I'm referring to the lower part of the page.
I'm going to add another answer to this:
That part of the page is created by JavaScript, so you can get it with Selenium. The code is provided below.
from selenium import webdriver
import chromedriver_autoinstaller

# auto-install chromedriver
chromedriver = chromedriver_autoinstaller.install()

# define and launch the driver
driver = webdriver.Chrome(chromedriver)
driver.maximize_window()

# go to the website and get the email
url = 'https://rejestradwokatow.pl/adwokat/artymiak-grzegorz-46439'
driver.get(url)
email_text = driver.find_element_by_xpath('//div[@class="mb_tab_content special_one"]/div[@class="line_list_K"]').text.split('Email: ')
email = email_text[-1]
print(email)
which prints:
gartymiak@protonmail.com
The email is generated with CSS: you have to extract the attribute values data-ea and data-eb from the div and join them with @:
name = soup.find('div', class_="address_e").get('data-ea')
domain = soup.find('div', class_="address_e").get('data-eb')
email = f'{name}@{domain}'
To get just the emails, enter the following:
email_1st_part = soup.find('div', class_="address_e").get('data-ea')
email_2nd_part = soup.find('div', class_="address_e").get('data-eb')
email = email_1st_part + '@' + email_2nd_part
Full Code:
import requests
from bs4 import BeautifulSoup
page = requests.get('https://rejestradwokatow.pl/adwokat/abramowicz-joanna-49486')
soup = BeautifulSoup(page.content, "html.parser")
email_1st_part = soup.find('div', class_="address_e").get('data-ea')
email_2nd_part = soup.find('div', class_="address_e").get('data-eb')
email = email_1st_part + '@' + email_2nd_part
Result:
print(email)
'abramowicz@pro.onet.pl'
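Both snippets can be folded into one small helper. This is only a sketch built from the selectors already shown above (the address_e div with the data-ea/data-eb attributes); it assumes every profile page uses the same markup:
import requests
from bs4 import BeautifulSoup

def get_email(profile_url):
    # fetch the profile and rebuild the obfuscated address from the data attributes
    soup = BeautifulSoup(requests.get(profile_url).content, 'html.parser')
    div = soup.find('div', class_='address_e')
    if div is None:
        return None  # page without the expected markup
    return f"{div.get('data-ea')}@{div.get('data-eb')}"

print(get_email('https://rejestradwokatow.pl/adwokat/abramowicz-joanna-49486'))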

Extract all links from drop down list combination

I have a sample website and I want to extract all the href links from it. It has two drop-downs, and once a selection is made the page displays results with a link to a manual to download.
It does not navigate to a different page; it shows the results on the same page. I have extracted the combinations from the drop-down lists, but when I try to extract the manual links I cannot find them.
The code is as follows:
from selenium import webdriver
from selenium.webdriver.support.ui import Select
import time
from bs4 import BeautifulSoup
import requests

url = "https://www.cars.com/"
headers = {"User-Agent": "Mozilla/5.0"}  # the headers dict was not shown in the original post

driver = webdriver.Chrome('C:/Users/webdrivers/chromedriver.exe')
driver.get(url)
time.sleep(4)

selectYear = Select(driver.find_element_by_id("odl-selected-year"))
data = []
for yearOption in selectYear.options:
    yearText = yearOption.text
    selectYear.select_by_visible_text(yearText)
    time.sleep(1)
    selectModel = Select(driver.find_element_by_id("odl-selected-model"))
    for modelOption in selectModel.options:
        modelText = modelOption.text
        selectModel.select_by_visible_text(modelText)
        data.append([yearText, modelText])
        page = requests.get(url, headers=headers)
        soup = BeautifulSoup(page.text, 'html.parser')
        content = soup.findAll('div', attrs={"class": "odl-results-container"})
        for i in content:
            x = i.findAll(['h3', 'span'])
            for y in x:
                print(y.get_text())
The print does not show any data. How can I get the links to the manuals? Thanks in advance.
You need to click the button for each car model and year and then retrieve the rendered HTML page source from your Selenium webdriver rather than with requests.
Add this in your inner loop:
button = driver.find_element_by_link_text("Select this vehicle")
button.click()
page = driver.page_source
soup = BeautifulSoup(page, 'html.parser')
content = soup.findAll('a',attrs={"class":"odl-download-link"})
for i in content:
    print(i["href"])
This prints out:
http://www.fordservicecontent.com/Ford_Content/vdirsnet/OwnerManual/Home/Index?Variantid=6875&languageCode=EN&countryCode=USA&marketCode=US&bookcode=O91668&VIN=&userMarket=GBR
http://www.fordservicecontent.com/Ford_Content/vdirsnet/OwnerManual/Home/Index?Variantid=7126&languageCode=EN&countryCode=USA&marketCode=US&bookcode=O134871&VIN=&userMarket=GBR
http://www.fordservicecontent.com/Ford_Content/vdirsnet/OwnerManual/Home/Index?Variantid=7708&languageCode=EN&countryCode=USA&marketCode=US&bookcode=O177941&VIN=&userMarket=GBR
...
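Putting the question's loop and this answer together, the inner loop ends up looking roughly like the sketch below. It reuses the element IDs, link text and CSS class that already appear above and assumes those selectors are still valid on the live page (BeautifulSoup and time are already imported at the top of the script):
all_links = []
for modelOption in selectModel.options:
    modelText = modelOption.text
    selectModel.select_by_visible_text(modelText)
    # trigger the search so the results panel is rendered in the browser
    driver.find_element_by_link_text("Select this vehicle").click()
    time.sleep(1)
    # parse the page Selenium actually rendered, not a fresh requests download
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    for a in soup.findAll('a', attrs={"class": "odl-download-link"}):
        all_links.append([yearText, modelText, a["href"]])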

gathering *ALL* links from given web page and then searching them by phrase

I am new to the Python language. As mentioned in the title, I am trying to obtain all the links from this page: https://web.archive.org/web/*/http://bankier.pl/
The links that interest me the most are grouped in sections like this:
year - month - week - day
The best option would be to search for a specific phrase across all (or some range) of the year/month/week/day links, or at least to download the links and 'grep' them locally.
So far I have tried BeautifulSoup and Selenium, but I never came close to gathering what interests me the most. It seemed like the links were on "lower levels" and my script could not reach them.
I would much appreciate your help.
Here are my attempts:
#############################
######bs4 script#############
#############################
import requests
from bs4 import BeautifulSoup
url = 'https://web.archive.org/web/*/http://bankier.pl/'
r = requests.get(url)
html_content = r.text
soup = BeautifulSoup(html_content, 'lxml')
links = [a.get('href') for a in soup.find_all('a', href=True)]
print(links)
and this:
#############################
######selenium script########
#############################
from selenium import webdriver
driver = webdriver.Chrome('/home/grzegorz/Documents/Techniczne/Skrypty/Python/Vulture/niewywalac/chromedriver')
driver.get('https://web.archive.org/web/*/http://bankier.pl/')
links = driver.find_elements_by_xpath('.//span[@class="file"]/a')
len(links)
for link in links:
    link.get_attribute('href')
...but none of these prints the links I need (i.e. the http://www.bankier.pl page from a specific date).
Unfortunately, right now I can't see a way of getting the URLs that appear when you hover the mouse over a day, but with the code below you can at least get the first URL from each captured day:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
url = 'https://web.archive.org/web/*/http://bankier.pl/'
driver.get(url)

# wait until the capture calendar has been rendered
WebDriverWait(driver, 20).until(
    EC.presence_of_element_located((By.XPATH, '//*[@id="wb-calendar"]')))

# one link per day cell in the calendar
captures = driver.find_elements_by_xpath(
    '//*[@id="wb-calendar"]/div/div/div/div/div/div/div/a')
for capture in captures:
    print(capture.get_attribute("href"))

driver.quit()
Let me know if this helps.
Good luck!
You can use the code below, written in Java, to get the corresponding output.
driver.get("https://web.archive.org/web/*/http://bankier.pl/");
List<String> url = new ArrayList<String>();
List<WebElement> link = driver.findElements(By.cssSelector(".month-week .month-day div div[class='captures'] a"));
for(WebElement Url : link) {
url.add(Url.getAttribute("href"));
}
*system.out.println(url.size())* // get total link
Iterator itr=url.iterator();
while(itr.hasNext()){
System.out.println(itr.next());
}
...but none of these prints the links I need
You are doing this:
for link in links:
    link.get_attribute('href')
which will print exactly nothing when run as a script.
Did you mean something like this?
for link in links:
    href = link.get_attribute('href')
    print(href)
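Once the hrefs are collected, the 'grep it locally' part of the question is straightforward: fetch each archived snapshot and check whether the phrase appears. A rough sketch, assuming snapshot_urls is the list of hrefs gathered by any of the snippets above and the phrase is only an example:
import requests

def find_phrase(snapshot_urls, phrase):
    # download each archived snapshot and keep the URLs whose HTML contains the phrase
    hits = []
    for url in snapshot_urls:
        html = requests.get(url).text
        if phrase.lower() in html.lower():
            hits.append(url)
    return hits

# snapshot_urls would be the list of hrefs gathered above, e.g.:
# snapshot_urls = [capture.get_attribute("href") for capture in captures]
# print(find_phrase(snapshot_urls, "kredyt"))  # "kredyt" is just an example phrase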

Python Selenium pull href info out of find_elements_by_partial_link_text

I'm working on pulling some data from a website. I can successfully navigate to the page that lists all the data updated the day before, but now I need to iterate through all the links and save the source of each page to a file.
Once it is in a file, I want to use BeautifulSoup to better arrange the data so I can parse through it.
#learn.py
from BeautifulSoup import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
url1 = 'https://odyssey.tarrantcounty.com/default.aspx'
date = '07/31/2014'
option_by_date = "6"
driver = webdriver.Firefox()
driver.get(url1)
continue_link = driver.find_element_by_partial_link_text('Case')
#follow link
continue_link.click()
driver.find_element_by_xpath("//select[#name='SearchBy']/option[text()='Date Filed']").click()
#fill in dates in form
from_date = driver.find_element_by_id("DateFiledOnAfter")
from_date.send_keys(date)
to_date = driver.find_element_by_id("DateFiledOnBefore")
to_date.send_keys(date)
submit_button = driver.find_element_by_id('SearchSubmit')
submit_button.click()
link_list = driver.find_elements_by_partial_link_text('2014')
link_list should be a list of the applicable links, but I'm not sure where to go from there.
Get all the links whose href attribute starts with CaseDetail.aspx?CaseID=; find_elements_by_xpath() helps here:
# get the list of links
links = [link.get_attribute('href')
         for link in driver.find_elements_by_xpath('//td/a[starts-with(@href, "CaseDetail.aspx?CaseID=")]')]
for link in links:
    # follow the link
    driver.get(link)
    # parse the data
    print(driver.find_element_by_class_name('ssCaseDetailCaseNbr').text)
Prints:
Case No. 2014-PR01986-2
Case No. 2014-PR01988-1
Case No. 2014-PR01989-1
...
Note that you don't need to save the pages and parse them via BeautifulSoup. Selenium itself is pretty powerful in navigating and extracting the data out of the webpages.
You can fetch web elements using their tag name. If you want to fetch all the links in a web page, I would use find_elements_by_tag_name().
links = driver.find_elements_by_tag_name('a')
link_urls = [link.get_attribute('href') for link in links]
source_dict = dict()
for url in link_urls:
    driver.get(url)
    source = driver.page_source  # this will give you the page source
    source_dict[url] = source
# source_dict will contain the source code for each URL, with the URL as the key.
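Since the original goal was to save each page's source and then tidy it up with BeautifulSoup, the dictionary above can be fed straight into the parser. A sketch only, reusing the ssCaseDetailCaseNbr class from the other answer as an example field:
from bs4 import BeautifulSoup

for url, source in source_dict.items():
    soup = BeautifulSoup(source, 'html.parser')
    case_no = soup.find(class_='ssCaseDetailCaseNbr')  # example field from the other answer
    if case_no is not None:
        print(url, case_no.get_text(strip=True))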
