Extract all links from drop down list combination

Extract all links from drop down list combination - python

I have a sample website and I want to extract all the "href links" from the website. It has two drop downs and once drop down is selected it displays results with link to manual to download.
It does not navigate to different page instead shows result on the same page. I have extracted the combination of drop down lists, I am trying to extract the manual links and I am unable to find the link.
code is as follows
from selenium import webdriver
from selenium.webdriver.support.ui import Select
import time
from bs4 import BeautifulSoup
import requests
url = "https://www.cars.com/"
driver = webdriver.Chrome('C:/Users/webdrivers/chromedriver.exe')
driver.get(url)
time.sleep(4)
selectYear = Select(driver.find_element_by_id("odl-selected-year"))
data = []
for yearOption in selectYear.options:
yearText = yearOption.text
selectYear.select_by_visible_text(yearText)
time.sleep(1)
selectModel = Select(driver.find_element_by_id("odl-selected-model"))
for modelOption in selectModel.options:
modelText = modelOption.text
selectModel.select_by_visible_text(modelText)
data.append([yearText,modelText])
page = requests.get(url, headers=headers)
soup = BeautifulSoup(page.text, 'html.parser')
content = soup.findAll('div',attrs={"class":"odl-results-container"})
for i in content:
x = i.findAll(['h3','span'])
for y in x:
print(y.get_text())
print does not show any data. How can I get the links for manuals? Thanks in advance

You need to click the button for each car model and year and then retrieve the rendered HTML page source from your Selenium webdriver rather than with requests.
Add this in your inner loop:
button = driver.find_element_by_link_text("Select this vehicle")
button.click()
page = driver.page_source
soup = BeautifulSoup(page, 'html.parser')
content = soup.findAll('a',attrs={"class":"odl-download-link"})
for i in content:
print(i["href"])
This prints out:
http://www.fordservicecontent.com/Ford_Content/vdirsnet/OwnerManual/Home/Index?Variantid=6875&languageCode=EN&countryCode=USA&marketCode=US&bookcode=O91668&VIN=&userMarket=GBR
http://www.fordservicecontent.com/Ford_Content/vdirsnet/OwnerManual/Home/Index?Variantid=7126&languageCode=EN&countryCode=USA&marketCode=US&bookcode=O134871&VIN=&userMarket=GBR
http://www.fordservicecontent.com/Ford_Content/vdirsnet/OwnerManual/Home/Index?Variantid=7708&languageCode=EN&countryCode=USA&marketCode=US&bookcode=O177941&VIN=&userMarket=GBR
...

Related

Extracting the titles of the websites mentioned in the link

https://www.g2.com/categories/marketing-automation
I am trying webscrap the above link that has list of 350+ websites i need to extract the title of the websites mentioned
But I am failing to get any results i have tried with using requests and beautiful soup
then with selenium and all i am getting is empty list "[]" or none
import requests
from bs4 import BeautifulSoup
# Send a GET request to the URL and parse the HTML content
url = 'https://www.g2.com/categories/marketing-automation'
response = requests.get(url).text
soup = BeautifulSoup(response, 'html.parser')
name = soup.find(class_ = "product-card__product-name")
print(name)
This above code is just a test code to check if the data is being pulled or not and the response is 'None'
From this code i am expecting to see the results of the class mentioned upon calling print

I kind of got this code to get something. Im still working on it.
from selenium import webdriver
from selenium.webdriver.common.by import By
# Create a new instance of the Chrome driver
driver = webdriver.Chrome()
# Navigate to the webpage
driver.get('https://www.g2.com/categories/marketing-automation')
# Wait for the page to load
driver.implicitly_wait(10)
# Find all the product cards on the page
product_cards = driver.find_elements(By.CLASS_NAME, 'product-card__product-name')
# Iterate over the product cards and extract the title from each one
for product_card in product_cards:
title = product_card.text
print(title)
# Close the browser
driver.quit()

How to scrape hyperlinks from multiple pages on webpage with unchanging URL, using Python/Beautiful Soup?

There is a paginated list of hyperlinks on this webpage: https://www.farmersforum.ie/mart-reports/county-Tipperary-mart/.
The code I have created till now scrapes the relevant links from the first page. I cannot figure out how to extract links from subsequent pages (8 links per page, about 25 pages).
There does not seem to be a way to navigate the pages using the URL.
from bs4 import BeautifulSoup
import urllib.request
# Scrape webpage
parser = 'html.parser' # or 'lxml' (preferred) or 'html5lib', if installed
resp = urllib.request.urlopen("https://www.farmersforum.ie/mart-reports/county-Tipperary-mart/")
soup = BeautifulSoup(resp, parser, from_encoding=resp.info().get_param('charset'))
# Extract links
links = []
for link in soup.find_all('a', href=True):
links.append(link['href'])
# Select relevant links, reformat, and drop duplicates
links = list(dict.fromkeys(["https://www.farmersforum.ie"+link for link in links if "/reports/Thurles" in link]))
Please advise for how I can do this using Python.

I've solved this with Selenium. Thank you.
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
import time
# Launch Chrome driver
driver = webdriver.Chrome(ChromeDriverManager().install())
# Open webpage
driver.get("https://www.farmersforum.ie/mart-reports/county-Tipperary-mart/")
# Loop through pages
allLnks = []
iStop = False
# Continue until fail to find button
while iStop == False:
for ii in range(2,12):
try:
# Click page
driver.find_element_by_xpath('//*[#id="mainContent"]/div/div[1]/div[2]/ul/li['+str(ii)+']/a').click()
except:
iStop = True
break
# Wait to load
time.sleep(0.1)
# Identify elements with tagname <a>
lnks=driver.find_elements_by_tag_name("a")
# Traverse list of links
iiLnks = []
for lnk in lnks:
# Use get_attribute() to get all href and add links to list
iiLnks.append(lnk.get_attribute("href"))
# Select relevant links, reformat, and drop duplicates
iiLnks = list(dict.fromkeys([iiLnk for iiLnk in iiLnks if "/reports/Thurles" in iiLnk]))
allLnks = allLnks + iiLnks
driver.find_element_by_xpath('//*[#id="mainContent"]/div/div[1]/div[2]/ul/li[12]/a').click()
driver.quit()

Python - Item Price Web Scraping for Target

I'm trying to get any item's price from Target website. I did some examples for this website using selenium and Redsky API but now I tried to wrote bs4 code below:
import requests
from bs4 import BeautifulSoup
url = "https://www.target.com/p/ruffles-cheddar-38-sour-cream-potato-chips-2-5oz/-/A-14930847#lnk=sametab"
r= requests.get(url)
soup = BeautifulSoup(r.content, "lxml")
price = soup.find("div",class_= "web-migration-tof__PriceFontSize-sc-14z8sos-14 elGGzp")
print(price)
But it returns me None .
I tried soup.find("div",{'class': "web-migration-tof__PriceFontSize-sc-14z8sos-14 elGGzp"})
What am I missing?
I can accept any selenium code or Redsky API code but my priority is bs4

The page is dynamic. The data is rendered after the initial request is made. You can use selenium to load the page, and once it's rendered, then you can pull out the relevant tag. API though is always the preferred way to go if it's available.
from selenium import webdriver
from bs4 import BeautifulSoup
driver = webdriver.Chrome('C:/chromedriver_win32/chromedriver.exe')
# If you don't want to open a browser, comment out the line above and uncomment below
#options = webdriver.ChromeOptions()
#options.add_argument('headless')
#driver = webdriver.Chrome('C:/chromedriver_win32/chromedriver.exe', options=options)
url = "https://www.target.com/p/ruffles-cheddar-38-sour-cream-potato-chips-2-5oz/-/A-14930847#lnk=sametab"
driver.get(url)
r = driver.page_source
soup = BeautifulSoup(r, "lxml")
price = soup.find("div",class_= "web-migration-tof__PriceFontSize-sc-14z8sos-14 elGGzp")
print(price.text)
Output:
$1.99

You are simply using wrong locator.
Try this
price_css_locator = 'div[data-test=product-price]'
or in XPath style
price_xpath_locator = '//div[#data-test="product-price"]'
With bs4 it should be something like this:
soup.select('div[data-test="product-price"]')
to get the element get you just need to add .text
price = soup.select('div[data-test="product-price"]').text
print(price)

use .text
price = soup.find("div",class_= "web-migration-tof__PriceFontSize-sc-14z8sos-14 elGGzp")
print(price.text)

Links from BeautifulSoup without href or <a>

I am trying to create a bot that scrapes all the image links from a site and store them somewhere else so I can download the images after.
from selenium import webdriver
import time
from bs4 import BeautifulSoup as bs
import requests
url = 'https://www.artstation.com/artwork?sorting=trending'
page = requests.get(url)
driver = webdriver.Chrome()
driver.get(url)
time.sleep(3)
soup = bs(driver.page_source, 'html.parser')
gallery = soup.find_all(class_="image-src")
data = gallery[0]
for x in range(len(gallery)):
print("TAG:", sep="\n")
print(gallery[x], sep="\n")
if page.status_code == 200:
print("Request OK")
This returns all the links tags i wanted but I can't find a way to remove the html or copy only the links to a new list. Here is an example of the tag i get:
<div class="image-src" image-src="https://cdnb.artstation.com/p/assets/images/images/012/269/255/20180810092820/smaller_square/vince-rizzi-batman-n52-p1-a.jpg?1533911301" ng-if="::!project.hide_as_adult"></div>
So, how do i get only the links within the gallery[] list?
What i want to do after is to take this links and edit the /smaller-square/ directory to /large/, which is the one that has the high resolution image.

The page loads it's data through AJAX, so through network inspector we see, where the call is made. This snippet will obtain all the image links found on page 1, sorted by trending:
import requests
import json
url = 'https://www.artstation.com/projects.json?page=1&sorting=trending'
page = requests.get(url)
json_data = json.loads(page.text)
for data in json_data['data']:
print(data['cover']['medium_image_url'])
Prints:
https://cdna.artstation.com/p/assets/images/images/012/272/796/medium/ben-zhang-brigitte-hero-concept.jpg?1533921480
https://cdna.artstation.com/p/assets/covers/images/012/279/572/medium/ham-sung-choul-braveking-140823-1-3-s3-mini.jpg?1533959982
https://cdnb.artstation.com/p/assets/covers/images/012/275/963/medium/michael-vicente-orb-gem-thumb.jpg?1533933774
https://cdnb.artstation.com/p/assets/images/images/012/275/635/medium/michael-kutsche-piglet-by-michael-kutsche.jpg?1533932387
https://cdna.artstation.com/p/assets/images/images/012/273/384/medium/ben-zhang-unnamed.jpg?1533923353
https://cdnb.artstation.com/p/assets/covers/images/012/273/083/medium/michael-vicente-orb-guardian-thumb.jpg?1533922229
... and so on.
If you print the variable json_data, you will see other information the page sends (like icon image url, total_count, data about the author etc.)

You can access the attributes using key-value.
Ex:
from bs4 import BeautifulSoup
s = '''<div class="image-src" image-src="https://cdnb.artstation.com/p/assets/images/images/012/269/255/20180810092820/smaller_square/vince-rizzi-batman-n52-p1-a.jpg?1533911301" ng-if="::!project.hide_as_adult"></div>'''
soup = BeautifulSoup(s, "html.parser")
print(soup.find("div", class_="image-src")["image-src"])
#or
print(soup.find("div", class_="image-src").attrs['image-src'])
Output:
https://cdnb.artstation.com/p/assets/images/images/012/269/255/20180810092820/smaller_square/vince-rizzi-batman-n52-p1-a.jpg?1533911301

Python Selenium accessing HTML source - After Search

Source Code:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
from bs4 import BeautifulSoup
path = "C:\\Python27\\chromedriver\\chromedriver"
driver = webdriver.Chrome(executable_path=path)
# Open Chrome
driver.get("http://www.thehindu.com/")
# 10 Second Delay
time.sleep(10)
elem = driver.find_element_by_id("searchString")
# Enter Keyword
elem.send_keys("unilever")
elem.send_keys(Keys.RETURN)
time.sleep(10)
# Problem Here
page = driver.page_source
soup = BeautifulSoup(page, 'lxml')
print soup
Above it the code.
I want to scrap data from "http://www.thehindu.com/", It searches for "unilever" word in search box and redirect to result page
Link for Search Page
Now I have a question for this, How can I get Source code of the searched Page.
Basically I want news related to "Unilever".

You can get text inside <body>:
body = driver.find_element_by_tag_name("body")
bodyText = body.get_attribute("innerText")
Then you can find your keyword in bodyText.

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

Extract all links from drop down list combination - python

Related

Extracting the titles of the websites mentioned in the link

How to scrape hyperlinks from multiple pages on webpage with unchanging URL, using Python/Beautiful Soup?

Python - Item Price Web Scraping for Target

Links from BeautifulSoup without href or <a>

Python Selenium accessing HTML source - After Search

Categories

Resources