How do I get only unique href elements in Selenium in Python?

I am trying to extract all the href links in anchor tags using Selenium for my web scraping project in Python.
The site has multiple result pages, and I am trying to access the href elements of a single page.
Below is the code:
url = "https://www.carwale.com/used/cars-for-sale/#sc=-1&so=-1&pn=1"
driver.get(url)
links = driver.find_elements_by_xpath('//*[@href]')
for l in links:
    print(l.get_attribute('href'))
On running my code the same href element gets printed multiple times.
Snippet of Output of the code:
https://www.carwale.com/used/cars-in-chennai/ford-figo-2010-2012-d2115418/?slot=4&rk=1&isP=true
https://www.carwale.com/used/cars-in-chennai/ford-figo-2010-2012-d2115418/?slot=4&rk=1&isP=true
https://www.carwale.com/used/cars-in-chennai/ford-figo-2010-2012-d2115418/?slot=4&rk=1&isP=true
How do I get it to print only once?

Do something like:
url = "https://www.carwale.com/used/cars-for-sale/#sc=-1&so=-1&pn=1"
driver.get(url)
processed = []
links = driver.find_elements_by_xpath('//*[@href]')
for link in links:
    href = link.get_attribute('href')
    if href not in processed:
        print(href)
        processed.append(href)
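If the order of the links does not matter, the same deduplication can also be written as a set comprehension over the href values; a minimal sketch, assuming driver is already on the page:
# A set keeps each href only once, so duplicates are dropped automatically
unique_hrefs = {el.get_attribute('href') for el in driver.find_elements_by_xpath('//*[@href]')}
for href in unique_hrefs:
    print(href)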

Related

How to scrape hyperlinks from multiple pages on webpage with unchanging URL, using Python/Beautiful Soup?

There is a paginated list of hyperlinks on this webpage: https://www.farmersforum.ie/mart-reports/county-Tipperary-mart/.
The code I have created till now scrapes the relevant links from the first page. I cannot figure out how to extract links from subsequent pages (8 links per page, about 25 pages).
There does not seem to be a way to navigate the pages using the URL.
from bs4 import BeautifulSoup
import urllib.request
# Scrape webpage
parser = 'html.parser' # or 'lxml' (preferred) or 'html5lib', if installed
resp = urllib.request.urlopen("https://www.farmersforum.ie/mart-reports/county-Tipperary-mart/")
soup = BeautifulSoup(resp, parser, from_encoding=resp.info().get_param('charset'))
# Extract links
links = []
for link in soup.find_all('a', href=True):
    links.append(link['href'])
# Select relevant links, reformat, and drop duplicates
links = list(dict.fromkeys(["https://www.farmersforum.ie"+link for link in links if "/reports/Thurles" in link]))
Please advise for how I can do this using Python.
I've solved this with Selenium. Thank you.
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
import time
# Launch Chrome driver
driver = webdriver.Chrome(ChromeDriverManager().install())
# Open webpage
driver.get("https://www.farmersforum.ie/mart-reports/county-Tipperary-mart/")
# Loop through pages
allLnks = []
iStop = False
# Continue until fail to find button
while iStop == False:
    for ii in range(2, 12):
        try:
            # Click page
            driver.find_element_by_xpath('//*[@id="mainContent"]/div/div[1]/div[2]/ul/li[' + str(ii) + ']/a').click()
        except:
            iStop = True
            break
        # Wait to load
        time.sleep(0.1)
        # Identify elements with tagname <a>
        lnks = driver.find_elements_by_tag_name("a")
        # Traverse list of links
        iiLnks = []
        for lnk in lnks:
            # Use get_attribute() to get all href and add links to list
            iiLnks.append(lnk.get_attribute("href"))
        # Select relevant links, reformat, and drop duplicates (skip anchors whose href is None)
        iiLnks = list(dict.fromkeys([iiLnk for iiLnk in iiLnks if iiLnk and "/reports/Thurles" in iiLnk]))
        allLnks = allLnks + iiLnks
    # Advance to the next block of pages unless the last page has been reached
    if not iStop:
        driver.find_element_by_xpath('//*[@id="mainContent"]/div/div[1]/div[2]/ul/li[12]/a').click()
driver.quit()

Selenium - Web page returns a "redirecting" link instead of the original link with get_attribute('href')

I'm trying to get a link from a page using Selenium. When checking the page's source code I can clearly see the original link, but when I use Selenium to select the element, and then use element.get_attribute('href'), the link that it returns is a different one.
# Web page url request
driver.get('https://www.facebook.com/ads/library/?active_status=all&ad_type=all&country=BR&q=myshopify&sort_data[direction]=desc&sort_data[mode]=relevancy_monthly_grouped&search_type=keyword_unordered&media_type=all')
driver.maximize_window()
time.sleep(10)
v_link = driver.find_element(By.XPATH, '//*[@id="facebook"]/body/div[5]/div[2]/div/div/div/div/div[3]/span/div[1]/div/div[2]/div[1]/div[2]/div[3]/a')
print(v_link.get_attribute('href'))
The actual link that I need: https://bhalliproducts.store/?_pos=1&_sid=8a26757f5&_ss=r
The link being returned: https://l.facebook.com/l.php?u=https%3A%2F%2Fbhalliproducts.store%2F%3F_pos%3D1%26_sid%3D8a26757f5%26_ss%3Dr&h=AT3KkXQbOn5s3oaaaCV2vjaAnyJqEqkIlqvP16g3eCsCnw-fx3VCNMR66_Zxs50v9JU5JK2DLABhoBHRNHQENH6oyp39Pho2Z6o25NZD5RIvl5kMow0lfd2rdaUWp11e6alEJFtoJp0X_uXgp5B2OYocRg5wGA
You can use the following solution:
from urllib.parse import unquote
href = "https://l.facebook.com/l.php?u=https%3A%2F%2Fbhalliproducts.store%2F%3F_pos%3D1%26_sid%3D8a26757f5%26_ss%3Dr&h=AT3KkXQbOn5s3oaaaCV2vjaAnyJqEqkIlqvP16g3eCsCnw-fx3VCNMR66_Zxs50v9JU5JK2DLABhoBHRNHQENH6oyp39Pho2Z6o25NZD5RIvl5kMow0lfd2rdaUWp11e6alEJFtoJp0X_uXgp5B2OYocRg5wGA"
begin = href.find('=') + 1  # the first '=' marks the start of the "u" parameter value
end = href.find('&')        # the first '&' marks its end
href = href[begin:end]      # slice out the encoded destination URL
href = unquote(href)        # percent-decode it back to the original link
print(href)
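A slightly more robust variant, in case the parameter order on the redirect URL ever changes, is to parse the query string properly and read the u parameter, which holds the original destination; a sketch using the same redirect link:
from urllib.parse import urlparse, parse_qs
redirect = "https://l.facebook.com/l.php?u=https%3A%2F%2Fbhalliproducts.store%2F%3F_pos%3D1%26_sid%3D8a26757f5%26_ss%3Dr&h=AT3KkXQbOn5s3oaaaCV2vjaAnyJqEqkIlqvP16g3eCsCnw-fx3VCNMR66_Zxs50v9JU5JK2DLABhoBHRNHQENH6oyp39Pho2Z6o25NZD5RIvl5kMow0lfd2rdaUWp11e6alEJFtoJp0X_uXgp5B2OYocRg5wGA"
# parse_qs splits the query string and percent-decodes each value,
# so the original destination is simply the value of the "u" parameter
params = parse_qs(urlparse(redirect).query)
print(params['u'][0])  # https://bhalliproducts.store/?_pos=1&_sid=8a26757f5&_ss=r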

How to get only valid url links while parsing a page with BeautifulSoup?

I am trying to get a list of the pages that a page links to (using the <a> tag). When parsing the page I use BeautifulSoup:
page = opener.open(url)
soup = BeautifulSoup(page.read(), features='lxml')
links = soup.findAll("a", href=True)
for link in links:
    validLink = bool(re.match(r'^(?:https?:\/\/)?(?:[^@\/\n]+@)?(?:www\.)?([^:\/\n]+)', link["href"]))
    if validLink:
        myset.append(link["href"])
This way it looks for <a href> tags to find the links and return the url of the linked page. But the resulting urls in myset look like this:
How can I filter the links that have hash sign in them? Also if I wanted to filter ad links or videos,... which element of the tag should I use?
For links that start with http or https (you only need to state http), you can use an attribute selector with the starts-with operator:
links = [item['href'] for item in soup.select('[href^=http]')]
For the ad/video links - do you wish to include or exclude them? We would need to see the relevant html. Is there a URL to use?
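To drop links whose href carries a hash fragment (or is nothing but a fragment such as #top), urllib.parse.urlparse can be used as a filter; a sketch, assuming soup already holds the parsed page:
from urllib.parse import urlparse
clean_links = []
for link in soup.find_all('a', href=True):
    parsed = urlparse(link['href'])
    # keep absolute http(s) URLs only, and skip anything containing a fragment
    if parsed.scheme in ('http', 'https') and not parsed.fragment:
        clean_links.append(link['href'])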

Get ALL links from driver.find_elements by href not working

New to Python and Selenium WebDriver. I am trying to check all the links on my own webpage and use their HTTP status codes to see whether each is a broken link or not. The code that I am running (reduced from the original)...
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import requests
links = driver.find_elements_by_xpath("//a[@href]")
while len(links):
    url = links.pop()
    url = url.get_attribute("href")
    print(url)
The html looks like...
<ul>
<li>visit google</li>
<li>broken link ex</li>
</ul>
When I run my script, the only link that gets printed is the google link and not the broken link. I have done some test cases and it seems that only the links that include the phrase "http://www" in the link get printed. Although I can change the href links on my webpage to include this phrase, I have specific reasons as to why they cannot be included.
If I can just get all the links (with or without the "http://www" phrase) using driver.find_elements_by_xpath("//a[@href]"), then I can convert these later in the script to include the phrase and then get the http status codes.
I saw other posts but none that helped me get over this obstacle. Any clarification/workaround/hint would be appreciated.
The following list comprehension should get you a list of all the links. It locates all anchor tags and builds a list containing the 'href' attribute of each element.
links = [elem.get_attribute("href") for elem in driver.find_elements_by_tag_name('a')]
Here is the same thing broken down into smaller steps and wrapped in a function:
def get_all_links(driver):
    links = []
    elements = driver.find_elements_by_tag_name('a')
    for elem in elements:
        href = elem.get_attribute("href")
        links.append(href)
    return links
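Once you have the list, the broken-link check the question is after can be done with requests; a sketch under the assumption that a failed request or any status of 400 and above counts as broken:
import requests
for url in get_all_links(driver):
    if not url:  # anchors without a resolvable href come back as None
        continue
    try:
        status = requests.head(url, allow_redirects=True, timeout=10).status_code
    except requests.RequestException:
        status = None
    if status is None or status >= 400:
        print("Broken link:", url, "status:", status)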

Python Selenium pull href info out of find_elements_by_partial_link_text

I'm working on pulling some data from a website. I can successfully surf to the page that lists all the updated data from the day before, but now I need to iterate through all the links and save the source of each page to a file.
Once in a file, I want to use BeautifulSoup to better arrange the data so I can parse through it.
#learn.py
from BeautifulSoup import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
url1 = 'https://odyssey.tarrantcounty.com/default.aspx'
date = '07/31/2014'
option_by_date = "6"
driver = webdriver.Firefox()
driver.get(url1)
continue_link = driver.find_element_by_partial_link_text('Case')
#follow link
continue_link.click()
driver.find_element_by_xpath("//select[#name='SearchBy']/option[text()='Date Filed']").click()
#fill in dates in form
from_date = driver.find_element_by_id("DateFiledOnAfter")
from_date.send_keys(date)
to_date = driver.find_element_by_id("DateFiledOnBefore")
to_date.send_keys(date)
submit_button = driver.find_element_by_id('SearchSubmit')
submit_button.click()
link_list = driver.find_elements_by_partial_link_text('2014')
link_list should be a list of the applicable links, but I'm not sure where to go from there.
Get all the links that have an href attribute starting with CaseDetail.aspx?CaseID=; find_elements_by_xpath() would help here:
# get the list of links
links = [link.get_attribute('href')
         for link in driver.find_elements_by_xpath('//td/a[starts-with(@href, "CaseDetail.aspx?CaseID=")]')]
for link in links:
    # follow the link
    driver.get(link)
    # parse the data
    print(driver.find_element_by_class_name('ssCaseDetailCaseNbr').text)
Prints:
Case No. 2014-PR01986-2
Case No. 2014-PR01988-1
Case No. 2014-PR01989-1
...
Note that you don't need to save the pages and parse them via BeautifulSoup. Selenium itself is pretty powerful in navigating and extracting the data out of the webpages.
You can fetch web elements using their tag name. If you want to fetch all the links in a web page, I would use find_elements_by_tag_name().
links = driver.find_elements_by_tag_name('a')
link_urls = [link.get_attribute('href') for link in links]
source_dict = dict()
for url in link_urls:
    driver.get(url)
    source = driver.page_source  # this will give you the page source
    source_dict[url] = source
# source_dict will contain the source code you wanted for each url, with the url as the key.
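If you then want to hand the saved sources to BeautifulSoup for further parsing, something along these lines would work; a sketch assuming the modern bs4 package rather than the old BeautifulSoup import from the question:
from bs4 import BeautifulSoup
for url, source in source_dict.items():
    soup = BeautifulSoup(source, 'html.parser')
    # example: print each page's title as a quick sanity check
    print(url, soup.title.string if soup.title else '(no title)')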
